Merge "Added OUTPUT_YUV_DENOISED CFLAG to VP8 encoder"
diff --git a/README b/README
index 6d7d5ec..f209105 100644
--- a/README
+++ b/README
@@ -1,5 +1,4 @@
-vpx Multi-Format Codec SDK
-README - 1 August 2013
+README - 30 May 2014
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -63,6 +62,7 @@
armv7-none-rvct
armv7-win32-vs11
armv7-win32-vs12
+ armv7s-darwin-gcc
mips32-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
@@ -80,6 +80,7 @@
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
+ x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
x86-os2-gcc
@@ -96,6 +97,7 @@
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
+ x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
x86_64-solaris-gcc
@@ -131,6 +133,14 @@
This defaults to config.log. This should give a good indication of what went
wrong. If not, contact us for support.
+VP8/VP9 TEST VECTORS:
+ The test vectors can be downloaded and verified using the build system after
+  running configure. To specify an alternate directory, set the
+  LIBVPX_TEST_DATA_PATH environment variable.
+
+ $ ./configure --enable-unit-tests
+ $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata
+
SUPPORT
This library is an open source project supported by its community. Please
email webm-discuss@webmproject.org for help.
diff --git a/build/arm-msvs/obj_int_extract.bat b/build/arm-msvs/obj_int_extract.bat
index 267ed61..c0987bc 100644
--- a/build/arm-msvs/obj_int_extract.bat
+++ b/build/arm-msvs/obj_int_extract.bat
@@ -11,8 +11,8 @@
REM %1 - Relative path to the directory containing the vp8 and vpx_scale
REM source directories.
REM %2 - Path to obj_int_extract.exe.
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c"
+cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vpx_scale/vpx_scale_asm_offsets.c"
%2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/build/make/Makefile b/build/make/Makefile
index 63ec271..dc61429 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -330,7 +330,10 @@
ifneq ($(target),)
include $(SRC_PATH_BARE)/$(target:-$(TOOLCHAIN)=).mk
endif
-ifeq ($(filter %clean,$(MAKECMDGOALS)),)
+
+skip_deps := $(filter %clean,$(MAKECMDGOALS))
+skip_deps += $(findstring testdata,$(MAKECMDGOALS))
+ifeq ($(strip $(skip_deps)),)
# Older versions of make don't like -include directives with no arguments
ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
-include $(filter %.d,$(OBJS-yes:.o=.d))
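
Note: dependency (.d) files are now skipped not only for clean targets but
also when the goal includes testdata, so fetching test vectors right after
configure does not first force a full dependency-generation pass.
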
diff --git a/build/make/configure.sh b/build/make/configure.sh
index ad7dc82..d4124c7 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -774,6 +774,13 @@
add_cflags "-mmacosx-version-min=10.9"
add_ldflags "-mmacosx-version-min=10.9"
;;
+ *-iphonesimulator-*)
+ add_cflags "-miphoneos-version-min=5.0"
+ add_ldflags "-miphoneos-version-min=5.0"
+ osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
+ add_cflags "-isysroot ${osx_sdk_dir}"
+ add_ldflags "-isysroot ${osx_sdk_dir}"
+ ;;
esac
# Handle Solaris variants. Solaris 10 needs -lposix4
@@ -795,7 +802,7 @@
armv8)
soft_enable neon
;;
- armv7)
+ armv7|armv7s)
soft_enable neon
soft_enable neon_asm
soft_enable media
@@ -824,7 +831,7 @@
arch_int=${arch_int%%te}
check_add_asflags --defsym ARCHITECTURE=${arch_int}
tune_cflags="-mtune="
- if [ ${tgt_isa} = "armv7" ]; then
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
if [ -z "${float_abi}" ]; then
check_cpp <<EOF && float_abi=hard || float_abi=softfp
#ifndef __ARM_PCS_VFP
@@ -1164,6 +1171,12 @@
# enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
enabled icc && ! enabled pic && add_cflags -fno-pic
;;
+ iphonesimulator)
+ add_asflags -f macho${bits}
+ enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
+ add_cflags ${sim_arch}
+ add_ldflags ${sim_arch}
+ ;;
os2)
add_asflags -f aout
enabled debug && add_asflags -g
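
Note: the new iphonesimulator targets reuse the Darwin flow but point the
compiler and linker at the simulator SDK reported by xcrun, selecting
-arch i386 or -arch x86_64 from the target ISA. armv7s is folded into the
existing armv7 handling, so it inherits the same NEON/media feature flags
and the float-ABI probe.
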
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 4e803b8..f1cc04e 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -67,7 +67,9 @@
if [ "${f##*.}" == "$pat" ]; then
unset file_list[i]
- objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,[:/],_,g')
+ objf=$(echo ${f%.*}.obj \
+ | sed -e "s,$src_path_bare,," \
+ -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
open_tag File RelativePath="$f"
if [ "$pat" == "asm" ] && $asm_use_custom_step; then
@@ -153,7 +155,7 @@
opt=${opt##-I}
opt=$(fix_path "$opt")
incs="${incs}${incs:+;}"${opt}""
- yasmincs="${yasmincs} -I${opt}"
+ yasmincs="${yasmincs} -I"${opt}""
;;
-D*) defines="${defines}${defines:+;}${opt##-D}"
;;
@@ -174,7 +176,8 @@
-*) die_unknown $opt
;;
*)
- file_list[${#file_list[@]}]="$(fix_path $opt)"
+ # The paths in file_list are fixed outside of the loop.
+ file_list[${#file_list[@]}]="$opt"
case "$opt" in
*.asm) uses_asm=true
;;
@@ -182,6 +185,10 @@
;;
esac
done
+
+# Make one call to fix_path for file_list to improve performance.
+fix_file_list
+
outfile=${outfile:-/dev/stdout}
guid=${guid:-`generate_uuid`}
asm_use_custom_step=false
@@ -300,7 +307,7 @@
vpx)
tag Tool \
Name="VCPreBuildEventTool" \
- CommandLine="call obj_int_extract.bat $src_path_bare $plat_no_ws\\\$(ConfigurationName)" \
+ CommandLine="call obj_int_extract.bat "$src_path_bare" $plat_no_ws\\\$(ConfigurationName)" \
tag Tool \
Name="VCCLCompilerTool" \
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 8529eed..eee354d 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -78,7 +78,9 @@
if [ "${f##*.}" == "$pat" ]; then
unset file_list[i]
- objf=$(echo ${f%.*}.obj | sed -e 's/^[\./]\+//g' -e 's,[:/],_,g')
+ objf=$(echo ${f%.*}.obj \
+ | sed -e "s,$src_path_bare,," \
+ -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
# Avoid object file name collisions, i.e. vpx_config.c and
@@ -175,7 +177,7 @@
opt=${opt##-I}
opt=$(fix_path "$opt")
incs="${incs}${incs:+;}"${opt}""
- yasmincs="${yasmincs} -I${opt}"
+ yasmincs="${yasmincs} -I"${opt}""
;;
-D*) defines="${defines}${defines:+;}${opt##-D}"
;;
@@ -196,7 +198,8 @@
-*) die_unknown $opt
;;
*)
- file_list[${#file_list[@]}]="$(fix_path $opt)"
+ # The paths in file_list are fixed outside of the loop.
+ file_list[${#file_list[@]}]="$opt"
case "$opt" in
*.asm|*.s) uses_asm=true
;;
@@ -204,6 +207,10 @@
;;
esac
done
+
+# Make one call to fix_path for file_list to improve performance.
+fix_file_list
+
outfile=${outfile:-/dev/stdout}
guid=${guid:-`generate_uuid`}
asm_use_custom_step=false
@@ -392,7 +399,7 @@
hostplat=Win32
fi
open_tag PreBuildEvent
- tag_content Command "call obj_int_extract.bat $src_path_bare $hostplat\\\$(Configuration)"
+ tag_content Command "call obj_int_extract.bat "$src_path_bare" $hostplat\\\$(Configuration)"
close_tag PreBuildEvent
fi
open_tag ClCompile
diff --git a/build/make/msvs_common.sh b/build/make/msvs_common.sh
index eb2eb7b..90c1488 100644
--- a/build/make/msvs_common.sh
+++ b/build/make/msvs_common.sh
@@ -13,7 +13,7 @@
&& cygpath --help >/dev/null 2>&1; then
FIXPATH='cygpath -m'
else
- FIXPATH='echo'
+ FIXPATH='echo_path'
fi
die() {
@@ -27,8 +27,23 @@
exit 1
}
+echo_path() {
+ for path; do
+ echo "$path"
+ done
+}
+
+# Output one path per line, adjusted for the host system when needed.
fix_path() {
- $FIXPATH "$1"
+ $FIXPATH "$@"
+}
+
+# Corrects the paths in file_list in one pass for efficiency.
+fix_file_list() {
+ # TODO(jzern): this could be more generic and take the array as a param.
+ files=$(fix_path "${file_list[@]}")
+ local IFS=$'\n'
+ file_list=($files)
}
generate_uuid() {
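
Note: fix_path now accepts multiple arguments and prints one path per line,
which lets fix_file_list convert the entire file_list array with a single
cygpath invocation instead of one subprocess per file. The local IFS=$'\n'
confines the newline splitting to the function.
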
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
index 44d095d..dfa3b90 100644
--- a/build/x86-msvs/obj_int_extract.bat
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -10,6 +10,6 @@
REM Arguments:
REM %1 - Relative path to the directory containing the vp8 source directory.
REM %2 - Path to obj_int_extract.exe.
-cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c"
+cl /I. /I%1 /nologo /c "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
diff --git a/configure b/configure
index b6d645a..800553e 100755
--- a/configure
+++ b/configure
@@ -103,6 +103,7 @@
all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-win32-vs11"
all_platforms="${all_platforms} armv7-win32-vs12"
+all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -120,6 +121,7 @@
all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc"
all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-iphonesimulator-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
all_platforms="${all_platforms} x86-os2-gcc"
@@ -136,6 +138,7 @@
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc"
all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
all_platforms="${all_platforms} x86_64-solaris-gcc"
@@ -723,6 +726,10 @@
# iOS/ARM builds do not work with gtest. This does not match
# x86 targets.
;;
+ *-iphonesimulator-*)
+ soft_enable webm_io
+ soft_enable libyuv
+ ;;
*-win*)
# Some mingw toolchains don't have pthread available by default.
# Treat these more like visual studio where threading in gtest
diff --git a/examples.mk b/examples.mk
index 946c030..ce833fc 100644
--- a/examples.mk
+++ b/examples.mk
@@ -306,6 +306,7 @@
--name=$$(@:.$(VCPROJ_SFX)=)\
--ver=$$(CONFIG_VS_VERSION)\
--proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\
+ --src-path-bare="$(SRC_PATH_BARE)" \
$$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
$$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) $$^
diff --git a/libs.mk b/libs.mk
index d02e9bc..2b072b6 100644
--- a/libs.mk
+++ b/libs.mk
@@ -222,6 +222,7 @@
--name=obj_int_extract \
--ver=$(CONFIG_VS_VERSION) \
--proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
+ --src-path-bare="$(SRC_PATH_BARE)" \
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$@ $^ \
-I. \
@@ -253,6 +254,7 @@
--proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
--module-def=vpx.def \
--ver=$(CONFIG_VS_VERSION) \
+ --src-path-bare="$(SRC_PATH_BARE)" \
--out=$@ $(CFLAGS) \
$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
--src-path-bare="$(SRC_PATH_BARE)" \
@@ -447,6 +449,7 @@
-D_VARIADIC_MAX=10 \
--proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
--ver=$(CONFIG_VS_VERSION) \
+ --src-path-bare="$(SRC_PATH_BARE)" \
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
-I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3412ddd..6af2abb 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -645,6 +645,26 @@
#endif
#if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+}
+
const ConvolveFunctions convolve8_avx2(
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -655,8 +675,10 @@
make_tuple(8, 4, &convolve8_avx2),
make_tuple(4, 8, &convolve8_avx2),
make_tuple(8, 8, &convolve8_avx2),
+ make_tuple(8, 16, &convolve8_avx2)));
+
+INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
make_tuple(16, 8, &convolve8_avx2),
- make_tuple(8, 16, &convolve8_avx2),
make_tuple(16, 16, &convolve8_avx2),
make_tuple(32, 16, &convolve8_avx2),
make_tuple(16, 32, &convolve8_avx2),
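
Note: a googletest instantiation whose name starts with DISABLED_ is still
compiled but skipped at run time (it can be forced with
--gtest_also_run_disabled_tests), so splitting the AVX2 convolve cases this
way keeps the larger block sizes building while the avx2 kernels stay
disabled in vp9_rtcd_defs.pl.
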
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 20b1c8f..e6a20fb 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -272,10 +272,18 @@
vp9_fdct16x16_c(in, out, stride);
}
+void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+ vp9_idct16x16_256_add_c(in, dest, stride);
+}
+
void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fht16x16_c(in, out, stride, tx_type);
}
+void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+ vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
class Trans16x16TestBase {
public:
virtual ~Trans16x16TestBase() {}
@@ -358,12 +366,13 @@
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
- if (i == 0)
+ if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = 255;
- if (i == 1)
+ } else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
+ }
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
@@ -378,6 +387,47 @@
}
}
+  void RunQuantCheck(int dc_thresh, int ac_thresh) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ }
+      if (i == 0) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = 255;
+      } else if (i == 1) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -255;
+      }
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+
+ // clear reconstructed pixel buffers
+ vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
+ vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+
+ // quantization with maximum allowed step sizes
+      output_ref_block[0] = (output_ref_block[0] / dc_thresh) * dc_thresh;
+      for (int j = 1; j < kNumCoeffs; ++j)
+        output_ref_block[j] = (output_ref_block[j] / ac_thresh) * ac_thresh;
+ inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+ REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(ref[j], dst[j]);
+ }
+ }
+
void RunInvAccuracyCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
@@ -414,6 +464,7 @@
int pitch_;
int tx_type_;
fht_t fwd_txfm_ref;
+ iht_t inv_txfm_ref;
};
class Trans16x16DCT
@@ -428,6 +479,7 @@
tx_type_ = GET_PARAM(2);
pitch_ = 16;
fwd_txfm_ref = fdct16x16_ref;
+ inv_txfm_ref = idct16x16_ref;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -455,6 +507,12 @@
RunMemCheck();
}
+TEST_P(Trans16x16DCT, QuantCheck) {
+  // Use the maximum allowed quantization step sizes for DC and AC
+ // coefficients respectively.
+ RunQuantCheck(1336, 1828);
+}
+
TEST_P(Trans16x16DCT, InvAccuracyCheck) {
RunInvAccuracyCheck();
}
@@ -471,6 +529,7 @@
tx_type_ = GET_PARAM(2);
pitch_ = 16;
fwd_txfm_ref = fht16x16_ref;
+ inv_txfm_ref = iht16x16_ref;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -498,6 +557,12 @@
RunMemCheck();
}
+TEST_P(Trans16x16HT, QuantCheck) {
+  // The encoder skips any non-DC intra prediction modes
+  // when the quantization step size goes beyond 988.
+ RunQuantCheck(549, 988);
+}
+
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
@@ -541,4 +606,29 @@
::testing::Values(
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
#endif
+
+#if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride);
+void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride,
+ int tx_type);
+}
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_AVX2, Trans16x16DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct16x16_avx2,
+ &vp9_idct16x16_256_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans16x16HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3)));
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_AVX2, Trans16x16HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0),
+ make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1),
+ make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2)));
+#endif
} // namespace
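
Note: the new QuantCheck tests emulate worst-case quantization by truncating
each coefficient to a multiple of the allowed step size before running the
inverse transform. A minimal scalar sketch of that emulation (hypothetical
helper, illustrative names, not part of the test):

    #include <stdint.h>

    /* Truncate transform coefficients to multiples of the quantizer step,
     * modeling the largest reconstruction error the inverse transform must
     * still handle exactly. */
    static void quantize_to_step(int16_t *coeffs, int num_coeffs,
                                 int dc_step, int ac_step) {
      int i;
      coeffs[0] = (coeffs[0] / dc_step) * dc_step;    /* DC term */
      for (i = 1; i < num_coeffs; ++i)
        coeffs[i] = (coeffs[i] / ac_step) * ac_step;  /* AC terms */
    }
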
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 72c0bd6..501c696 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -179,12 +179,13 @@
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
}
- if (i == 0)
+ if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = 255;
- if (i == 1)
+ } else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
+ }
const int stride = 32;
vp9_fdct32x32_c(input_extreme_block, output_ref_block, stride);
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 030665e..ec233d3 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -136,12 +136,13 @@
input_block[j] = rnd.Rand8() - rnd.Rand8();
input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
}
- if (i == 0)
+ if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = 255;
- if (i == 1)
+ } else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
+ }
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
@@ -375,4 +376,19 @@
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_avx2,
+ &vp9_idct4x4_16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0),
+ make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1),
+ make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2),
+ make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3)));
+#endif
+
} // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index c7cf164..146aa31 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -177,23 +177,36 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
int total_error = 0;
+ int total_coeff_error = 0;
const int count_test_block = 100000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j) {
- src[j] = rnd.Rand8() % 2 ? 255 : 0;
- dst[j] = src[j] > 0 ? 0 : 255;
+ if (i == 0) {
+ src[j] = 255;
+ dst[j] = 0;
+ } else if (i == 1) {
+ src[j] = 0;
+ dst[j] = 255;
+ } else {
+ src[j] = rnd.Rand8() % 2 ? 255 : 0;
+ dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+ }
+
test_input_block[j] = src[j] - dst[j];
}
REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_temp_block, pitch_));
REGISTER_STATE_CHECK(
+ fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
+ REGISTER_STATE_CHECK(
RunInvTxfm(test_temp_block, dst, pitch_));
for (int j = 0; j < 64; ++j) {
@@ -202,6 +215,9 @@
if (max_error < error)
max_error = error;
total_error += error;
+
+ const int coeff_diff = test_temp_block[j] - ref_temp_block[j];
+ total_coeff_error += abs(coeff_diff);
}
EXPECT_GE(1, max_error)
@@ -211,6 +227,10 @@
EXPECT_GE(count_test_block/5, total_error)
<< "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
<< " roundtrip error > 1/5 per block";
+
+ EXPECT_EQ(0, total_coeff_error)
+ << "Error: Extremal 8x8 FDCT/FHT has"
+ << "overflow issues in the intermediate steps > 1";
}
}
@@ -347,4 +367,18 @@
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, FwdTrans8x8DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+ AVX2, FwdTrans8x8HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0),
+ make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1),
+ make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2),
+ make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3)));
+#endif
} // namespace
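
Note: the extended round-trip test now also runs the C reference forward
transform on the same input and accumulates the absolute coefficient
differences. Since i == 0 and i == 1 force all-plus-255 and all-minus-255
residual blocks, any nonzero total_coeff_error on these extremal inputs
points to overflow in the optimized transform's intermediate arithmetic
rather than ordinary rounding.
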
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 79ef521..f2171b2 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -28,7 +28,8 @@
namespace {
typedef void (*fwd_txfm_t)(const int16_t *in, int16_t *out, int stride);
typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<inv_txfm_t,
+typedef std::tr1::tuple<fwd_txfm_t,
+ inv_txfm_t,
inv_txfm_t,
TX_SIZE, int> partial_itxfm_param_t;
const int kMaxNumCoeffs = 1024;
@@ -36,10 +37,11 @@
public:
virtual ~PartialIDctTest() {}
virtual void SetUp() {
- full_itxfm_ = GET_PARAM(0);
- partial_itxfm_ = GET_PARAM(1);
- tx_size_ = GET_PARAM(2);
- last_nonzero_ = GET_PARAM(3);
+ ftxfm_ = GET_PARAM(0);
+ full_itxfm_ = GET_PARAM(1);
+ partial_itxfm_ = GET_PARAM(2);
+ tx_size_ = GET_PARAM(3);
+ last_nonzero_ = GET_PARAM(4);
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -47,10 +49,90 @@
protected:
int last_nonzero_;
TX_SIZE tx_size_;
+ fwd_txfm_t ftxfm_;
inv_txfm_t full_itxfm_;
inv_txfm_t partial_itxfm_;
};
+TEST_P(PartialIDctTest, RunQuantCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int size;
+ switch (tx_size_) {
+ case TX_4X4:
+ size = 4;
+ break;
+ case TX_8X8:
+ size = 8;
+ break;
+ case TX_16X16:
+ size = 16;
+ break;
+ case TX_32X32:
+ size = 32;
+ break;
+ default:
+ FAIL() << "Wrong Size!";
+ break;
+ }
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
+
+ const int count_test_block = 1000;
+ const int block_size = size * size;
+
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+
+ int max_error = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // clear out destination buffer
+ memset(dst1, 0, sizeof(*dst1) * block_size);
+ memset(dst2, 0, sizeof(*dst2) * block_size);
+ memset(test_coef_block1, 0, sizeof(*test_coef_block1) * block_size);
+ memset(test_coef_block2, 0, sizeof(*test_coef_block2) * block_size);
+
+    // Initialize a test block with input range [-255, 255].
+    if (i == 0) {
+      for (int j = 0; j < block_size; ++j)
+        input_extreme_block[j] = 255;
+    } else if (i == 1) {
+      for (int j = 0; j < block_size; ++j)
+        input_extreme_block[j] = -255;
+    } else {
+      for (int j = 0; j < block_size; ++j)
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+
+    ftxfm_(input_extreme_block, output_ref_block, size);
+
+    // quantization with maximum allowed step sizes
+    test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
+    for (int j = 1; j < last_nonzero_; ++j)
+      test_coef_block1[vp9_default_scan_orders[tx_size_].scan[j]]
+          = (output_ref_block[j] / 1828) * 1828;
+
+ REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
+ REGISTER_STATE_CHECK(partial_itxfm_(test_coef_block1, dst2, size));
+
+ for (int j = 0; j < block_size; ++j) {
+ const int diff = dst1[j] - dst2[j];
+ const int error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ }
+ }
+
+ EXPECT_EQ(0, max_error)
+ << "Error: partial inverse transform produces different results";
+}
+
TEST_P(PartialIDctTest, ResultsMatch) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int size;
@@ -119,47 +201,60 @@
INSTANTIATE_TEST_CASE_P(
C, PartialIDctTest,
::testing::Values(
- make_tuple(&vp9_idct32x32_1024_add_c,
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c,
&vp9_idct32x32_34_add_c,
TX_32X32, 34),
- make_tuple(&vp9_idct32x32_1024_add_c,
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c,
&vp9_idct32x32_1_add_c,
TX_32X32, 1),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_10_add_c,
TX_16X16, 10),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_1_add_c,
TX_16X16, 1),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_c,
TX_8X8, 12),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_c,
TX_8X8, 1),
- make_tuple(&vp9_idct4x4_16_add_c,
+ make_tuple(&vp9_fdct4x4_c,
+ &vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_c,
TX_4X4, 1)));
#if HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
::testing::Values(
- make_tuple(&vp9_idct32x32_1024_add_c,
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c,
&vp9_idct32x32_1_add_neon,
TX_32X32, 1),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_10_add_neon,
TX_16X16, 10),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_1_add_neon,
TX_16X16, 1),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_neon,
TX_8X8, 12),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_neon,
TX_8X8, 1),
- make_tuple(&vp9_idct4x4_16_add_c,
+ make_tuple(&vp9_fdct4x4_c,
+ &vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_neon,
TX_4X4, 1)));
#endif
@@ -168,35 +263,53 @@
INSTANTIATE_TEST_CASE_P(
SSE2, PartialIDctTest,
::testing::Values(
- make_tuple(&vp9_idct32x32_1024_add_c,
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c,
&vp9_idct32x32_34_add_sse2,
TX_32X32, 34),
- make_tuple(&vp9_idct32x32_1024_add_c,
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c,
&vp9_idct32x32_1_add_sse2,
TX_32X32, 1),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_10_add_sse2,
TX_16X16, 10),
- make_tuple(&vp9_idct16x16_256_add_c,
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
&vp9_idct16x16_1_add_sse2,
TX_16X16, 1),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_sse2,
TX_8X8, 12),
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_1_add_sse2,
TX_8X8, 1),
- make_tuple(&vp9_idct4x4_16_add_c,
+ make_tuple(&vp9_fdct4x4_c,
+ &vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_sse2,
TX_4X4, 1)));
#endif
#if HAVE_SSSE3 && ARCH_X86_64
INSTANTIATE_TEST_CASE_P(
- SSSE3, PartialIDctTest,
+ SSSE3_64, PartialIDctTest,
::testing::Values(
- make_tuple(&vp9_idct8x8_64_add_c,
+ make_tuple(&vp9_fdct8x8_c,
+ &vp9_idct8x8_64_add_c,
&vp9_idct8x8_12_add_ssse3,
TX_8X8, 12)));
#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, PartialIDctTest,
+ ::testing::Values(
+ make_tuple(&vp9_fdct16x16_c,
+ &vp9_idct16x16_256_add_c,
+ &vp9_idct16x16_10_add_ssse3,
+ TX_16X16, 10)));
+#endif
} // namespace
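
Note: RunQuantCheck feeds both inverse transforms a coefficient block in
which only the first last_nonzero_ positions in scan order are populated.
The invariant being enforced, as a sketch (illustrative names, assuming the
test's buffers and assert from <assert.h>):

    /* With at most `last_nonzero` coefficients populated (in scan order),
     * the shortcut inverse transform must reconstruct exactly the same
     * pixels as the full inverse transform. */
    full_itxfm(test_coef_block1, dst1, size);     /* e.g. ..._256_add_c   */
    partial_itxfm(test_coef_block1, dst2, size);  /* e.g. ..._10_add_sse2 */
    for (int j = 0; j < block_size; ++j)
      assert(dst1[j] == dst2[j]);  /* i.e. max_error must remain 0 */
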
diff --git a/test/sad_test.cc b/test/sad_test.cc
index adb191f..89d8c41 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -185,10 +185,8 @@
}
void CheckSAD(unsigned int max_sad) {
- unsigned int reference_sad, exp_sad;
-
- reference_sad = ReferenceSAD(max_sad, 0);
- exp_sad = SAD(max_sad, 0);
+ const unsigned int reference_sad = ReferenceSAD(max_sad, 0);
+ const unsigned int exp_sad = SAD(max_sad, 0);
if (reference_sad <= max_sad) {
ASSERT_EQ(exp_sad, reference_sad);
@@ -218,10 +216,8 @@
}
void CheckSAD() {
- unsigned int reference_sad, exp_sad;
-
- reference_sad = ReferenceSAD(UINT_MAX, 0);
- exp_sad = SAD(0);
+ const unsigned int reference_sad = ReferenceSAD(UINT_MAX, 0);
+ const unsigned int exp_sad = SAD(0);
ASSERT_EQ(reference_sad, exp_sad);
}
@@ -631,4 +627,24 @@
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
+#if HAVE_AVX2
+#if CONFIG_VP9_ENCODER
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[], int ref_stride,
+ unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[], int ref_stride,
+ unsigned int *sad_array);
+}
+const sad_n_by_n_by_4_fn_t sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
+const sad_n_by_n_by_4_fn_t sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
+INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values(
+ make_tuple(32, 32, sad_32x32x4d_avx2),
+ make_tuple(64, 64, sad_64x64x4d_avx2)));
+#endif // CONFIG_VP9_ENCODER
+#endif // HAVE_AVX2
+
} // namespace
diff --git a/test/svc_test.cc b/test/svc_test.cc
index fb9277b..db26a8e 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -31,7 +31,6 @@
SvcTest()
: codec_iface_(0),
test_file_name_("hantro_collage_w352h288.yuv"),
- stats_file_name_("hantro_collage_w352h288.stat"),
codec_initialized_(false),
decoder_(0) {
memset(&svc_, 0, sizeof(svc_));
@@ -74,7 +73,6 @@
struct vpx_codec_enc_cfg codec_enc_;
vpx_codec_iface_t *codec_iface_;
std::string test_file_name_;
- std::string stats_file_name_;
bool codec_initialized_;
Decoder *decoder_;
};
@@ -364,7 +362,9 @@
EXPECT_EQ(kHeight * 8 / 16, layer_height);
}
-TEST_F(SvcTest, FirstPassEncode) {
+TEST_F(SvcTest, TwoPassEncode) {
+ // First pass encode
+ std::string stats_buf;
svc_.spatial_layers = 2;
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
@@ -383,50 +383,44 @@
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
ASSERT_EQ(VPX_CODEC_OK, res);
- EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
+ size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+ EXPECT_GT(stats_size, 0U);
+ const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+ ASSERT_TRUE(stats_data != NULL);
+ stats_buf.append(stats_data, stats_size);
// FRAME 1
video.Next();
res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
- EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
+ stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+ EXPECT_GT(stats_size, 0U);
+ stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+ ASSERT_TRUE(stats_data != NULL);
+ stats_buf.append(stats_data, stats_size);
// Flush encoder and test EOS packet
res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
- EXPECT_GT(vpx_svc_get_rc_stats_buffer_size(&svc_), 0U);
-}
+ stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
+ EXPECT_GT(stats_size, 0U);
+ stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
+ ASSERT_TRUE(stats_data != NULL);
+ stats_buf.append(stats_data, stats_size);
-TEST_F(SvcTest, SecondPassEncode) {
- svc_.spatial_layers = 2;
+ // Tear down encoder
+ vpx_svc_release(&svc_);
+ vpx_codec_destroy(&codec_);
+
+ // Second pass encode
codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
+ codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
- FILE *const stats_file = libvpx_test::OpenTestDataFile(stats_file_name_);
- ASSERT_TRUE(stats_file != NULL) << "Stats file open failed. Filename: "
- << stats_file;
-
- struct vpx_fixed_buf stats_buf;
- fseek(stats_file, 0, SEEK_END);
- stats_buf.sz = static_cast<size_t>(ftell(stats_file));
- fseek(stats_file, 0, SEEK_SET);
-
- stats_buf.buf = malloc(stats_buf.sz);
- ASSERT_TRUE(stats_buf.buf != NULL);
- const size_t bytes_read = fread(stats_buf.buf, 1, stats_buf.sz, stats_file);
- ASSERT_EQ(bytes_read, stats_buf.sz);
- fclose(stats_file);
- codec_enc_.rc_twopass_stats_in = stats_buf;
-
- vpx_codec_err_t res =
- vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
ASSERT_EQ(VPX_CODEC_OK, res);
codec_initialized_ = true;
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
- codec_enc_.g_timebase.den,
- codec_enc_.g_timebase.num, 0, 30);
// FRAME 0
video.Begin();
// This frame is a keyframe.
@@ -465,8 +459,6 @@
static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
vpx_svc_get_frame_size(&svc_));
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
-
- free(stats_buf.buf);
}
} // namespace
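
Note: the merged TwoPassEncode test replaces the canned .stat file with an
in-memory pipeline: each first-pass packet from vpx_svc_get_rc_stats_buffer()
is appended to a std::string, the encoder is torn down, and the buffer is
handed to the second pass through rc_twopass_stats_in (a vpx_fixed_buf
taking buf and sz). This is why the .stat test vector and its sha1 entry are
deleted below.
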
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 9c23929..56946b5 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,5 +1,4 @@
d5dfb0151c9051f8c85999255645d7a23916d3c0 hantro_collage_w352h288.yuv
-998cec53307c94aa5835aaf8d5731f6a3c7c2e5a hantro_collage_w352h288.stat
b87815bf86020c592ccc7a846ba2e28ec8043902 hantro_odd.yuv
b1f1c3ec79114b9a0651af24ce634afb44a9a419 rush_hour_444.y4m
5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf
diff --git a/test/test.mk b/test/test.mk
index f0a27c7..56e467a 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -131,7 +131,6 @@
## TEST DATA
##
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.stat
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
diff --git a/test/variance_test.cc b/test/variance_test.cc
index c9bf13a..9985695 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -702,6 +702,57 @@
make_tuple(6, 6, subpel_avg_variance64x64_ssse3)));
#endif
#endif
+
+#if HAVE_AVX2
+// TODO(jzern): these prototypes can be removed after the avx2 versions are
+// reenabled in vp9_rtcd_defs.pl.
+extern "C" {
+unsigned int vp9_sub_pixel_variance32x32_avx2(
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset,
+ const uint8_t *ref_ptr, int ref_stride, unsigned int *sse,
+ const uint8_t *second_pred);
+}
+const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
+const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
+const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
+const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
+const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+ AVX2, VP9VarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_avx2),
+ make_tuple(5, 4, variance32x16_avx2),
+ make_tuple(5, 5, variance32x32_avx2),
+ make_tuple(6, 5, variance64x32_avx2),
+ make_tuple(6, 6, variance64x64_avx2)));
+
+const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
+ vp9_sub_pixel_variance32x32_avx2;
+const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
+ vp9_sub_pixel_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_AVX2, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2),
+ make_tuple(6, 6, subpel_variance64x64_avx2)));
+
+const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
+ vp9_sub_pixel_avg_variance32x32_avx2;
+const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
+ vp9_sub_pixel_avg_variance64x64_avx2;
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_AVX2, VP9SubpelAvgVarianceTest,
+ ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
+ make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
+#endif // HAVE_AVX2
#endif // CONFIG_VP9_ENCODER
} // namespace vp9
diff --git a/third_party/googletest/README.libvpx b/third_party/googletest/README.libvpx
index 6fdeb87..7201a67 100644
--- a/third_party/googletest/README.libvpx
+++ b/third_party/googletest/README.libvpx
@@ -12,4 +12,4 @@
generation.
Local Modifications:
-None.
\ No newline at end of file
+Removed unused declarations of kPathSeparatorString for a warning-free build.
\ No newline at end of file
diff --git a/third_party/googletest/src/src/gtest-all.cc b/third_party/googletest/src/src/gtest-all.cc
index a9a03b2..8d90627 100644
--- a/third_party/googletest/src/src/gtest-all.cc
+++ b/third_party/googletest/src/src/gtest-all.cc
@@ -7904,7 +7904,6 @@
// of them.
const char kPathSeparator = '\\';
const char kAlternatePathSeparator = '/';
-const char kPathSeparatorString[] = "\\";
const char kAlternatePathSeparatorString[] = "/";
# if GTEST_OS_WINDOWS_MOBILE
// Windows CE doesn't have a current directory. You should not use
@@ -7918,7 +7917,6 @@
# endif // GTEST_OS_WINDOWS_MOBILE
#else
const char kPathSeparator = '/';
-const char kPathSeparatorString[] = "/";
const char kCurrentDirectoryString[] = "./";
#endif // GTEST_OS_WINDOWS
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index b8e4034..78cc6fa 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -68,8 +68,8 @@
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
/* Go over lines. */
- int i;
- for (i = 0; i < 16; ++i) {
+ int r;
+ for (r = 0; r < 16; ++r) {
/* Load inputs. */
const uint8x16_t v_sig = vld1q_u8(sig);
const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
@@ -145,14 +145,91 @@
/* Too much adjustments => copy block. */
{
- const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
vget_low_s64(v_sum_diff_total));
- const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+ int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
int sum_diff_thresh = SUM_DIFF_THRESHOLD;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
- if (s0 > sum_diff_thresh)
+ if (sum_diff > sum_diff_thresh) {
+        // Before returning to copy the block (i.e., apply no denoising),
+        // check if we can still apply some (weaker) temporal filtering to
+        // this block, which would otherwise not be denoised at all. The
+        // simplest option is an additional adjustment to running_avg_y to
+        // bring it closer to sig. The adjustment is capped by a maximum
+        // delta, and chosen such that in most cases the resulting sum_diff
+        // will be within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ sig -= sig_stride * 16;
+ mc_running_avg_y -= mc_running_avg_y_stride * 16;
+ running_avg_y -= running_avg_y_stride * 16;
+ for (r = 0; r < 16; ++r) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
+ v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
+ v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
+ v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment =
+ vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
+ v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
+ v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+
+ const int16x8_t fe_dc_ba_98_76_54_32_10 =
+ vpaddlq_s8(v_sum_diff);
+ const int32x4_t fedc_ba98_7654_3210 =
+ vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 =
+ vpaddlq_s32(fedc_ba98_7654_3210);
+
+ v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
+ fedcba98_76543210);
+ }
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_running_avg_y_stride;
+ running_avg_y += running_avg_y_stride;
+ }
+ {
+ // Update the sum of all pixel differences of this MB.
+ x = vqadd_s64(vget_high_s64(v_sum_diff_total),
+ vget_low_s64(v_sum_diff_total));
+ sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
+
+ if (sum_diff > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ }
+ } else {
return COPY_BLOCK;
+ }
+ }
}
/* Tell above level that block was filtered. */
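
Note: the NEON block above is dense; a scalar sketch of the capped
second-chance adjustment it applies (per-pixel view, hypothetical scalar
rewrite, abs from <stdlib.h>):

    /* If the full filter over-adjusted (sum_diff > threshold), retry with
     * a small correction that pulls running_avg_y back toward the source.
     * delta grows with the excess over the threshold, capped at 3. */
    int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
    if (delta < 4) {
      for (int r = 0; r < 16; ++r) {
        for (int c = 0; c < 16; ++c) {
          const int diff = sig[c] - mc_running_avg_y[c];
          const int adj = abs(diff) < delta ? abs(diff) : delta;
          /* saturating add/subtract in the NEON version */
          running_avg_y[c] += (diff > 0) ? adj : -adj;
        }
        sig += sig_stride;
        mc_running_avg_y += mc_running_avg_y_stride;
        running_avg_y += running_avg_y_stride;
      }
    }
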
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 34879cf..1f212ca 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -125,9 +125,9 @@
int optimize;
int q_index;
- int increase_denoising;
#if CONFIG_TEMPORAL_DENOISING
+ int increase_denoising;
MB_PREDICTION_MODE best_sse_inter_mode;
int_mv best_sse_mv;
MV_REFERENCE_FRAME best_reference_frame;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index cf6a82f..817c9ef 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -590,9 +590,9 @@
int distortion2;
int bestsme = INT_MAX;
int best_mode_index = 0;
- unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+ unsigned int sse = UINT_MAX, best_rd_sse = UINT_MAX;
#if CONFIG_TEMPORAL_DENOISING
- unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
+ unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX;
#endif
int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index d9f39b5..f145d09 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1973,8 +1973,8 @@
cpi->common.y1dc_delta_q);
#if CONFIG_TEMPORAL_DENOISING
- unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX,
- best_rd_sse = INT_MAX;
+ unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX,
+ best_rd_sse = UINT_MAX;
#endif
mode_mv = mode_mv_sb[sign_bias];
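
Note: sse, best_sse, and the related locals are unsigned int, so
initializing them with INT_MAX left a sentinel that real SAD/SSE values
could legally exceed; UINT_MAX is the correct "not yet set" value for an
unsigned minimum search.
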
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index f44ada1..e56a0b7 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -53,32 +53,41 @@
}
static int alloc_mi(VP9_COMMON *cm, int mi_size) {
- cm->mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
- if (cm->mip == NULL)
- return 1;
+ int i;
- cm->prev_mip = (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->prev_mip));
- if (cm->prev_mip == NULL)
- return 1;
+ for (i = 0; i < 2; ++i) {
+ cm->mip_array[i] =
+ (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (cm->mip_array[i] == NULL)
+ return 1;
- cm->mi_grid_base =
- (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
- if (cm->mi_grid_base == NULL)
- return 1;
+ cm->mi_grid_base_array[i] =
+ (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+ if (cm->mi_grid_base_array[i] == NULL)
+ return 1;
+ }
- cm->prev_mi_grid_base =
- (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
- if (cm->prev_mi_grid_base == NULL)
- return 1;
+ // Init the index.
+ cm->mi_idx = 0;
+ cm->prev_mi_idx = 1;
+
+ cm->mip = cm->mip_array[cm->mi_idx];
+ cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
+ cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
return 0;
}
static void free_mi(VP9_COMMON *cm) {
- vpx_free(cm->mip);
- vpx_free(cm->prev_mip);
- vpx_free(cm->mi_grid_base);
- vpx_free(cm->prev_mi_grid_base);
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ vpx_free(cm->mip_array[i]);
+ cm->mip_array[i] = NULL;
+ vpx_free(cm->mi_grid_base_array[i]);
+ cm->mi_grid_base_array[i] = NULL;
+ }
cm->mip = NULL;
cm->prev_mip = NULL;
@@ -237,13 +246,16 @@
}
void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Swap indices.
+ const int tmp = cm->mi_idx;
+ cm->mi_idx = cm->prev_mi_idx;
+ cm->prev_mi_idx = tmp;
+
// Current mip will be the prev_mip for the next frame.
- MODE_INFO *temp = cm->prev_mip;
- MODE_INFO **temp2 = cm->prev_mi_grid_base;
- cm->prev_mip = cm->mip;
- cm->mip = temp;
- cm->prev_mi_grid_base = cm->mi_grid_base;
- cm->mi_grid_base = temp2;
+ cm->mip = cm->mip_array[cm->mi_idx];
+ cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx];
+ cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx];
// Update the upper left visible macroblock ptrs.
cm->mi = cm->mip + cm->mi_stride + 1;
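
Note: ownership of the MODE_INFO allocations moves into the two-element
mip_array/mi_grid_base_array, and vp9_swap_mi_and_prev_mi() now just flips
mi_idx/prev_mi_idx before refreshing the working pointers. free_mi() can
therefore always free both array slots, something the old pointer-swapping
scheme made fragile once mip and prev_mip had traded places.
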
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index d868776..3253bcb 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -33,6 +33,9 @@
#define pair_set_epi16(a, b) \
_mm_set_epi16(b, a, b, a, b, a, b, a)
+#define dual_set_epi16(a, b) \
+ _mm_set_epi16(b, b, b, b, a, a, a, a)
+
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 20de434..e1753a1 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -142,6 +142,11 @@
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
+ int mi_idx;
+ int prev_mi_idx;
+ MODE_INFO *mip_array[2];
+ MODE_INFO **mi_grid_base_array[2];
+
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c300cde..66a3956 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -305,15 +305,15 @@
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -360,7 +360,7 @@
$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 neon_asm dspr2/;
+specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
@@ -422,10 +422,6 @@
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
-add_proto qw/void vp9_get_sse_sum_16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_16x16 sse2/;
-$vp9_get_sse_sum_16x16_sse2=vp9_get16x16var_sse2;
-
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,9 +431,11 @@
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc";
-add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get_sse_sum_8x8 sse2/;
-$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2;
+add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
+
+add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -449,10 +447,10 @@
specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -479,10 +477,10 @@
specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -655,7 +653,7 @@
specialize qw/vp9_sad4x4x8 sse4/;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
@@ -670,7 +668,7 @@
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x16x4d sse2/;
@@ -741,19 +739,31 @@
specialize qw/vp9_fht8x8 sse2 avx2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2 avx2/;
+specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct4x4_1 sse2/;
+
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct4x4 sse2 avx2/;
+add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct8x8_1 sse2/;
+
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";
+add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct16x16_1 sse2/;
+
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2 avx2/;
+specialize qw/vp9_fdct16x16 sse2/;
+
+add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2 avx2/;
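
Note: in vp9_rtcd_defs.pl, add_proto declares a function's signature and
specialize lists the per-ISA implementations the run-time dispatcher may
select. Removing avx2 from a specialize line disables those kernels
entirely, which is why the test files above re-declare the avx2 prototypes
via extern "C" and move their cases into DISABLED_ instantiations until the
kernels are re-enabled.
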
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ff9c432..b60f8a0 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -375,15 +375,6 @@
out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
}
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
-
#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
{ \
const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
@@ -612,23 +603,6 @@
RECON_AND_STORE(dest, dc_value);
}
-static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
static void idct8_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.h b/vp9/common/x86/vp9_idct_intrin_sse2.h
index 1c62e32..0f179b4 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -45,6 +45,32 @@
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
__m128i tbuf[8];
array_transpose_8x8(res0, res0);
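
TRANSPOSE_8X4 and array_transpose_4X8 move into this shared header so the
new SSSE3 16x16 path added below can reuse them. Both are built from the
same unpacklo/unpackhi interleave idiom; as a scalar reference (a sketch,
not part of the patch), array_transpose_4X8 reads eight rows whose first
four 16-bit lanes are valid and writes their 4x8 transpose:

    #include <stdint.h>

    static void transpose_4x8_ref(const int16_t in[8][8], int16_t out[4][8]) {
      int i, j;
      for (i = 0; i < 8; ++i)      /* input rows */
        for (j = 0; j < 4; ++j)    /* valid lanes per input row */
          out[j][i] = in[i][j];
    }
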
diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c
index e5d3cb5..73bf5d1 100644
--- a/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c
@@ -16,7 +16,7 @@
#include <tmmintrin.h> // SSSE3
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-static void idct16_8col(__m128i *in) {
+static void idct16_8col(__m128i *in, int round) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -36,6 +36,8 @@
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
__m128i v[16], u[16], s[16], t[16];
@@ -266,28 +268,80 @@
t[15] = _mm_add_epi16(s[12], s[15]);
// stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
+ if (round == 1) {
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
- u[0] = _mm_sub_epi16(t[13], t[10]);
- u[1] = _mm_add_epi16(t[13], t[10]);
- u[2] = _mm_sub_epi16(t[12], t[11]);
- u[3] = _mm_add_epi16(t[12], t[11]);
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
- s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
- s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
- s[14] = t[14];
- s[15] = t[15];
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+ } else {
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_sub_epi16(t[13], t[10]);
+ u[1] = _mm_add_epi16(t[13], t[10]);
+ u[2] = _mm_sub_epi16(t[12], t[11]);
+ u[3] = _mm_add_epi16(t[12], t[11]);
+
+ s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
+ s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
+ s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
+ s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
+ s[14] = t[14];
+ s[15] = t[15];
+ }
// stage 7
in[0] = _mm_add_epi16(s[0], s[15]);
@@ -308,10 +362,10 @@
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
array_transpose_16x16(in0, in1);
- idct16_8col(in0);
- idct16_8col(in1);
+ idct16_8col(in0, round);
+ idct16_8col(in1, round);
}
void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
@@ -322,10 +376,387 @@
input += 8;
load_buffer_8x16(input, in1);
- idct16_sse2(in0, in1);
- idct16_sse2(in0, in1);
+ idct16_sse2(in0, in1, 0);
+ idct16_sse2(in0, in1, 1);
write_buffer_8x16(dest, in0, stride);
dest += 8;
write_buffer_8x16(dest, in1, stride);
}
+
+static void idct16_10_r1(__m128i *in, __m128i *l) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_01 = dual_set_epi16(3212, 32610);
+ const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
+ const __m128i stg3_01 = dual_set_epi16(6392, 32138);
+ const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ __m128i stp1_0, stp1_1, stp1_4, stp1_6,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
+
+ // Stage2
+ {
+ const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
+ const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
+
+ stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01);
+ stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
+ stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
+
+ stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
+ stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
+
+ tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
+ tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+
+ stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+ stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
+ stp2_9 = _mm_packs_epi32(tmp1, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp2, tmp4);
+
+ stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
+ }
+
+ // Stage5 and Stage6
+ {
+ tmp0 = _mm_add_epi16(stp2_8, stp2_11);
+ tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
+ tmp2 = _mm_add_epi16(stp2_9, stp2_10);
+ tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
+
+ stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
+ stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
+ stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
+ stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
+
+ stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
+ stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
+ stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
+ stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
+ }
+
+ // Stage6
+ {
+ const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
+ const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
+ const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
+ const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
+ const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
+ const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
+
+ tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
+ tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
+ tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
+
+ stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+ tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01);
+ tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01);
+
+ stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
+ stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
+
+ tmp0 = _mm_add_epi16(stp1_0, stp1_4);
+ tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
+ tmp2 = _mm_add_epi16(stp1_1, stp1_6);
+ tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
+
+ stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
+ stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
+ stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
+ stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
+ stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
+ stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
+ stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
+ }
+
+ // Stage7. Left 8x16 only.
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+static void idct16_10_r2(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ const __m128i stg2_0 = dual_set_epi16(3212, 3212);
+ const __m128i stg2_1 = dual_set_epi16(32610, 32610);
+ const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
+ const __m128i stg2_7 = dual_set_epi16(31358, 31358);
+ const __m128i stg3_0 = dual_set_epi16(6392, 6392);
+ const __m128i stg3_1 = dual_set_epi16(32138, 32138);
+ const __m128i stg4_01 = dual_set_epi16(23170, 23170);
+
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ /* Stage2 */
+ {
+ stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0);
+ stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1);
+ stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6);
+ stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
+ }
+
+ /* Stage3 */
+ {
+ stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
+ stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
+
+ stp1_9 = stp1_8_0;
+ stp1_10 = stp1_11;
+
+ stp1_13 = stp1_12_0;
+ stp1_14 = stp1_15;
+ }
+
+ /* Stage4 */
+ {
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+ stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
+
+ stp2_5 = stp2_4;
+ stp2_6 = stp2_7;
+
+ tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
+ tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
+ tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
+ tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
+ tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+ tmp4 = _mm_srai_epi32(tmp4, 14);
+ tmp5 = _mm_srai_epi32(tmp5, 14);
+ tmp6 = _mm_srai_epi32(tmp6, 14);
+ tmp7 = _mm_srai_epi32(tmp7, 14);
+
+ stp2_9 = _mm_packs_epi32(tmp0, tmp1);
+ stp2_14 = _mm_packs_epi32(tmp2, tmp3);
+ stp2_10 = _mm_packs_epi32(tmp4, tmp5);
+ stp2_13 = _mm_packs_epi32(tmp6, tmp7);
+ }
+
+ /* Stage5 */
+ {
+ stp1_2 = stp1_0;
+ stp1_3 = stp1_0;
+
+ tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_add_epi16(stp2_6, stp2_5);
+
+ stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
+ stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
+
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
+
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+ }
+
+ /* Stage6 */
+ {
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
+ stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
+
+ tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
+ tmp1 = _mm_add_epi16(stp1_13, stp1_10);
+ tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
+ tmp3 = _mm_add_epi16(stp1_12, stp1_11);
+
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+ stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
+ stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
+ stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
+ stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
+ }
+
+ // Stage7
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+}
+
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i in[16], l[16];
+
+ int i;
+ // First 1-D inverse DCT
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
+
+ idct16_10_r1(in, l);
+
+ // Second 1-D inverse transform, performed per 8x16 block
+ for (i = 0; i < 2; i++) {
+ array_transpose_4X8(l + 8*i, in);
+
+ idct16_10_r2(in);
+
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+
+ dest += 8 - (stride * 16);
+ }
+}
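
Two fixed-point idioms recur throughout this file. The exact path packs
coefficient pairs with pair_set_epi16 and uses _mm_madd_epi16 followed by a
round and an arithmetic shift by DCT_CONST_BITS (14); the shortcut uses
_mm_mulhrs_epi16, which rounds and shifts by 15, so its constants are
pre-doubled (23170 == 2 * cospi_16_64 == 2 * 11585). Per lane the two
reduce to the following sketch, assuming libvpx's DCT_CONST_ROUNDING of
1 << 13:

    #include <stdint.h>

    /* one lane of the exact butterfly: madd, round, shift by 14 */
    static int32_t butterfly_exact(int16_t a, int16_t b,
                                   int16_t c0, int16_t c1) {
      return (a * c0 + b * c1 + (1 << 13)) >> 14;
    }

    /* one lane of _mm_mulhrs_epi16: shift by 15, hence doubled constants */
    static int16_t mulhrs_lane(int16_t x, int16_t k) {
      return (int16_t)((x * k + (1 << 14)) >> 15);
    }
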
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index b2bd3ed..fc70035 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -204,7 +204,7 @@
switch (tx_size) {
case TX_4X4:
tx_type = get_tx_type_4x4(plane_type, xd, block);
- vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
+ vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_8X8:
tx_type = get_tx_type(plane_type, xd);
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index a6edf0c..ab4f9a2 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -43,6 +43,8 @@
int refresh_frame_flags;
+ int frame_parallel_decode; // frame-based threading.
+
VP9Worker lf_worker;
VP9Worker *tile_workers;
int num_tile_workers;
diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c
index 47ad8d8..0d6b41d 100644
--- a/vp9/encoder/vp9_aq_complexity.c
+++ b/vp9/encoder/vp9_aq_complexity.c
@@ -47,11 +47,21 @@
// Use some of the segments for in-frame Q adjustment.
for (segment = 1; segment < 2; segment++) {
- const int qindex_delta =
+ int qindex_delta =
vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
in_frame_q_adj_ratio[segment]);
- vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
- vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+
+  // For AQ mode 2, we don't allow Q0 in a segment if the base Q is not 0.
+ // Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment Q delta
+ // is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
}
}
}
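
The clamp above keeps a segment from landing on q-index 0 (lossless, which
forces 4x4-only transforms) when the base q-index is nonzero. Restated as a
standalone sketch: with base_qindex 10 and a computed delta of -10 the
segment would hit 0, so the delta is clamped to -9 and the segment codes at
q-index 1; if the sum still is not positive, the feature stays disabled.

    static int segment_qindex(int base_qindex, int qindex_delta) {
      /* e.g. base 10, delta -10: clamp to -9, segment uses q-index 1 */
      if (base_qindex != 0 && base_qindex + qindex_delta == 0)
        qindex_delta = -base_qindex + 1;
      return base_qindex + qindex_delta;  /* enabled only when > 0 */
    }
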
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index c3cd93b..2463ed0 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -28,6 +28,7 @@
struct buf_2d src;
// Quantizer settings
+ int16_t *quant_fp;
int16_t *quant;
int16_t *quant_shift;
int16_t *zbin;
@@ -48,7 +49,7 @@
MACROBLOCKD e_mbd;
int skip_block;
- int select_txfm_size;
+ int select_tx_size;
int skip_recode;
int skip_optimize;
int q_index;
@@ -105,6 +106,9 @@
int use_lp32x32fdct;
int skip_encode;
+ // skip forward transform and quantization
+ int skip_txfm;
+
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index bb384aa..872223b 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -33,6 +33,7 @@
int is_coded;
int num_4x4_blk;
int skip;
+ int skip_txfm;
int best_mode_index;
int hybrid_pred_diff;
int comp_pred_diff;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 5772767..5c99a0a 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -43,6 +43,17 @@
output[3] = fdct_round_shift(temp2);
}
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
+ int r, c;
+ int16_t sum = 0;
+ for (r = 0; r < 4; ++r)
+ for (c = 0; c < 4; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum << 3;
+ output[1] = 0;
+}
+
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -240,6 +251,17 @@
output[7] = fdct_round_shift(t3);
}
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
+ int r, c;
+ int16_t sum = 0;
+ for (r = 0; r < 8; ++r)
+ for (c = 0; c < 8; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum * 8;
+ output[1] = 0;
+}
+
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
int i, j;
int16_t intermediate[64];
@@ -311,6 +333,17 @@
}
}
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
+ int r, c;
+ int16_t sum = 0;
+ for (r = 0; r < 16; ++r)
+ for (c = 0; c < 16; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum * 8;
+ output[1] = 0;
+}
+
void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -1329,6 +1362,17 @@
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
+ int r, c;
+ int16_t sum = 0;
+ for (r = 0; r < 32; ++r)
+ for (c = 0; c < 32; ++c)
+ sum += input[r * stride + c];
+
+ output[0] = sum << 2;
+ output[1] = 0;
+}
+
void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
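
Each vp9_fdctNxN_1_c added above is a DC-only forward transform: it sums
the residual block and applies a size-dependent scale (the patch uses
<< 3, * 8, * 8 and << 2 for the four sizes), writing only output[0] and a
cleared output[1], which is all the DC-only quantizers used by the fast
path in vp9_encodemb.c consume. The shared shape, as a hypothetical helper
with the scale left as a parameter:

    #include <stdint.h>

    static void fdct_dc_only(const int16_t *input, int16_t *output,
                             int stride, int n, int scale) {
      int r, c;
      int16_t sum = 0;
      for (r = 0; r < n; ++r)
        for (c = 0; c < n; ++c)
          sum += input[r * stride + c];
      output[0] = sum * scale;
      output[1] = 0;
    }
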
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5d3735e..001ac69 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -478,8 +478,8 @@
unsigned int sse = 0;
int sum = 0;
if (x_idx < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
- d + y_idx * dp + x_idx, dp, &sse, &sum);
+ vp9_get8x8var(s + y_idx * sp + x_idx, sp,
+ d + y_idx * dp + x_idx, dp, &sse, &sum);
fill_variance(sse, sum, 64, &vst->split[k].part_variances.none);
}
}
@@ -547,22 +547,6 @@
}
}
-// Original activity measure from Tim T's code.
-static unsigned int tt_activity_measure(MACROBLOCK *x) {
- unsigned int sse;
- // TODO: This could also be done over smaller areas (8x8), but that would
- // require extensive changes elsewhere, as lambda is assumed to be fixed
- // over an entire MB in most of the code.
- // Another option is to compute four 8x8 variances, and pick a single
- // lambda using a non-linear combination (e.g., the smallest, or second
- // smallest, etc.).
- const unsigned int act = vp9_variance16x16(x->plane[0].src.buf,
- x->plane[0].src.stride,
- VP9_VAR_OFFS, 0, &sse) << 4;
- // If the region is flat, lower the activity some more.
- return act < (8 << 12) ? MIN(act, 5 << 12) : act;
-}
-
static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int output_enabled) {
@@ -713,6 +697,38 @@
x->e_mbd.plane[i].subsampling_y);
}
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
+ int64_t *dist, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ INTERP_FILTER filter_ref;
+
+ if (xd->up_available)
+ filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ else if (xd->left_available)
+ filter_ref = xd->mi[-1]->mbmi.interp_filter;
+ else
+ filter_ref = EIGHTTAP;
+
+ mbmi->sb_type = bsize;
+ mbmi->mode = ZEROMV;
+ mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[tx_mode]);
+ mbmi->skip = 1;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->mv[0].as_int = 0;
+ mbmi->interp_filter = filter_ref;
+
+ xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
+ x->skip = 1;
+ x->skip_encode = 1;
+
+ *rate = 0;
+ *dist = 0;
+}
+
static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
@@ -1214,9 +1230,9 @@
int b_offset = b_mi_row * MI_SIZE * src_stride +
b_mi_col * MI_SIZE;
- vp9_get_sse_sum_16x16(src + b_offset, src_stride,
- pre_src + b_offset, pre_stride,
- &d16[j].sse, &d16[j].sum);
+ vp9_get16x16var(src + b_offset, src_stride,
+ pre_src + b_offset, pre_stride,
+ &d16[j].sse, &d16[j].sum);
d16[j].var = d16[j].sse -
(((uint32_t)d16[j].sum * d16[j].sum) >> 8);
@@ -1354,6 +1370,7 @@
}
x->skip = ctx->skip;
+ x->skip_txfm = mbmi->segment_id ? 0 : ctx->skip_txfm;
}
static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
@@ -1872,8 +1889,8 @@
BLOCK_SIZE min_size = BLOCK_32X32;
BLOCK_SIZE max_size = BLOCK_8X8;
int bsl = mi_width_log2_lookup[BLOCK_64X64];
- int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
- cpi->sf.chessboard_index) & 0x01;
+ const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm)) % 2;
// Trap case where we do not have a prediction.
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
@@ -2369,22 +2386,6 @@
sizeof(*xd->above_seg_context) * aligned_mi_cols);
}
-static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
- if (lossless) {
- // printf("Switching to lossless\n");
- cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
- cpi->mb.itxm_add = vp9_iwht4x4_add;
- cpi->mb.optimize = 0;
- cpi->common.lf.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
- cpi->common.tx_mode = ONLY_4X4;
- } else {
- // printf("Not lossless\n");
- cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
- cpi->mb.itxm_add = vp9_idct4x4_add;
- }
-}
-
static int check_dual_ref_flags(VP9_COMP *cpi) {
const int ref_flags = cpi->ref_frame_flags;
@@ -2396,15 +2397,15 @@
}
}
-static void reset_skip_txfm_size(VP9_COMMON *cm, TX_SIZE txfm_max) {
+static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
int mi_row, mi_col;
const int mis = cm->mi_stride;
MODE_INFO **mi_ptr = cm->mi_grid_visible;
for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if (mi_ptr[mi_col]->mbmi.tx_size > txfm_max)
- mi_ptr[mi_col]->mbmi.tx_size = txfm_max;
+ if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size)
+ mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
}
}
}
@@ -2421,7 +2422,7 @@
}
static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
- if (cpi->oxcf.lossless) {
+ if (cpi->mb.e_mbd.lossless) {
return ONLY_4X4;
} else if (cpi->common.current_video_frame == 0) {
return TX_MODE_SELECT;
@@ -2434,6 +2435,8 @@
return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] >
rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
ALLOW_32X32 : TX_MODE_SELECT;
+ } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) {
+ return ALLOW_8X8;
} else {
unsigned int total = 0;
int i;
@@ -2450,18 +2453,6 @@
}
}
-// Start RTC Exploration
-typedef enum {
- BOTH_ZERO = 0,
- ZERO_PLUS_PREDICTED = 1,
- BOTH_PREDICTED = 2,
- NEW_PLUS_NON_INTRA = 3,
- BOTH_NEW = 4,
- INTRA_PLUS_NON_INTRA = 5,
- BOTH_INTRA = 6,
- INVALID_CASE = 9
-} motion_vector_context;
-
static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
PREDICTION_MODE mode) {
mbmi->mode = mode;
@@ -2483,17 +2474,21 @@
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
set_offsets(cpi, tile, mi_row, mi_col, bsize);
- xd->mi[0]->mbmi.sb_type = bsize;
+ mbmi = &xd->mi[0]->mbmi;
+ mbmi->sb_type = bsize;
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
- if (xd->mi[0]->mbmi.segment_id && x->in_static_area)
+ if (mbmi->segment_id && x->in_static_area)
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
}
if (!frame_is_intra_only(cm)) {
- vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col,
- rate, dist, bsize);
+ if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+ else
+ vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize);
} else {
set_mode_info(&xd->mi[0]->mbmi, bsize, DC_PRED);
}
@@ -2619,6 +2614,7 @@
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
&this_rate, &this_dist, bsize);
ctx->mic.mbmi = xd->mi[0]->mbmi;
+ ctx->skip_txfm = x->skip_txfm;
if (this_rate != INT_MAX) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2705,6 +2701,7 @@
&this_rate, &this_dist, subsize);
pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -2714,6 +2711,7 @@
&this_rate, &this_dist, subsize);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
if (this_rate == INT_MAX) {
sum_rd = INT64_MAX;
@@ -2743,12 +2741,14 @@
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
&this_rate, &this_dist, subsize);
pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[0].skip_txfm = x->skip_txfm;
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
load_pred_mv(x, ctx);
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
&this_rate, &this_dist, subsize);
pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[1].skip_txfm = x->skip_txfm;
if (this_rate == INT_MAX) {
sum_rd = INT64_MAX;
} else {
@@ -2837,14 +2837,17 @@
case PARTITION_NONE:
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->none.skip_txfm = x->skip_txfm;
break;
case PARTITION_VERT:
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[0].skip_txfm = x->skip_txfm;
if (mi_col + hbs < cm->mi_cols) {
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
&rate, &dist, subsize);
pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[1].skip_txfm = x->skip_txfm;
if (rate != INT_MAX && dist != INT64_MAX &&
*totrate != INT_MAX && *totdist != INT64_MAX) {
*totrate += rate;
@@ -2855,10 +2858,12 @@
case PARTITION_HORZ:
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
if (mi_row + hbs < cm->mi_rows) {
nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
&rate, &dist, subsize);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
if (rate != INT_MAX && dist != INT64_MAX &&
*totrate != INT_MAX && *totdist != INT64_MAX) {
*totrate += rate;
@@ -2993,6 +2998,33 @@
cm->show_frame;
}
+static void encode_tiles(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_col, tile_row;
+ TOKENEXTRA *tok = cpi->tok;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+ TOKENEXTRA *old_tok = tok;
+ int mi_row;
+
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ if (cpi->sf.use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
+ encode_nonrd_sb_row(cpi, &tile, mi_row, &tok);
+ else
+ encode_rd_sb_row(cpi, &tile, mi_row, &tok);
+ }
+ cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok);
+ assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
+ }
+ }
+}
+
static void encode_frame_internal(VP9_COMP *cpi) {
SPEED_FEATURES *const sf = &cpi->sf;
RD_OPT *const rd_opt = &cpi->rd;
@@ -3011,13 +3043,21 @@
vp9_zero(rd_opt->tx_select_diff);
vp9_zero(rd_opt->tx_select_threshes);
- cm->tx_mode = select_tx_mode(cpi);
-
cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
+
+ cm->tx_mode = select_tx_mode(cpi);
+
+ cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+
+ if (cpi->mb.e_mbd.lossless) {
+ cpi->mb.optimize = 0;
+ cpi->common.lf.filter_level = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+ }
vp9_frame_init_quantizer(cpi);
@@ -3026,6 +3066,7 @@
init_encode_frame_mb_context(cpi);
set_prev_mi(cm);
+ x->skip_txfm = 0;
if (sf->use_nonrd_pick_mode) {
// Initialize internal buffer pointers for rtc coding, where non-RD
// mode decision is used and hence no buffer pointer swap needed.
@@ -3070,33 +3111,7 @@
struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
- {
- // Take tiles into account and give start/end MB
- int tile_col, tile_row;
- TOKENEXTRA *tp = cpi->tok;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
-
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- TileInfo tile;
- TOKENEXTRA *tp_old = tp;
- int mi_row;
-
- // For each row of SBs in the frame
- vp9_tile_init(&tile, cm, tile_row, tile_col);
- for (mi_row = tile.mi_row_start;
- mi_row < tile.mi_row_end; mi_row += MI_BLOCK_SIZE) {
- if (sf->use_nonrd_pick_mode && cm->frame_type != KEY_FRAME)
- encode_nonrd_sb_row(cpi, &tile, mi_row, &tp);
- else
- encode_rd_sb_row(cpi, &tile, mi_row, &tp);
- }
- cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
- assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
- }
- }
- }
+ encode_tiles(cpi);
vpx_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
@@ -3240,16 +3255,16 @@
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32 == 0) {
cm->tx_mode = ALLOW_8X8;
- reset_skip_txfm_size(cm, TX_8X8);
+ reset_skip_tx_size(cm, TX_8X8);
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
cm->tx_mode = ONLY_4X4;
- reset_skip_txfm_size(cm, TX_4X4);
+ reset_skip_tx_size(cm, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
cm->tx_mode = ALLOW_32X32;
} else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
cm->tx_mode = ALLOW_16X16;
- reset_skip_txfm_size(cm, TX_16X16);
+ reset_skip_tx_size(cm, TX_16X16);
}
}
} else {
@@ -3310,7 +3325,7 @@
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
- x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
+ x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 &&
cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
cpi->sf.allow_skip_recode;
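
encode_frame_internal now derives losslessness from the actual quantizer
state rather than a config flag, folding the old switch_lossless_mode
helper inline: a lossless frame selects the Walsh-Hadamard 4x4 transform
pair, disables trellis optimization and turns the loop filter off. The
check itself, restated from the hunk above as a sketch:

    static int frame_is_lossless(const VP9_COMMON *cm) {
      return cm->base_qindex == 0 && cm->y_dc_delta_q == 0 &&
             cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
    }
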
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 8581e61..1c00698 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -301,6 +301,52 @@
vp9_fdct32x32(src, dst, src_stride);
}
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ int i, j;
+ const int16_t *src_diff;
+
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_1(src_diff, coeff, diff_stride);
+ vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vp9_fdct16x16_1(src_diff, coeff, diff_stride);
+ vp9_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vp9_fdct8x8_1(src_diff, coeff, diff_stride);
+ vp9_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_dc(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ default:
+ assert(0);
+ }
+}
+
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -376,8 +422,19 @@
return;
}
- if (!x->skip_recode)
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ if (x->skip_txfm == 0) {
+ // full forward transform and quantization
+ if (!x->skip_recode)
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ } else if (x->skip_txfm == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
const int ctx = combine_entropy_contexts(*a, *l);
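
The encode-block hunk above dispatches on a three-valued x->skip_txfm that
the non-RD pick-mode code (later in this patch) sets per block. With
hypothetical names for the bare 0/1/2 used by the patch:

    enum {
      SKIP_TXFM_NONE = 0,     /* full forward transform + quantization */
      SKIP_TXFM_AC_DC = 1,    /* skip both: force eob to 0 and return */
      SKIP_TXFM_AC_ONLY = 2   /* DC-only fast path: vp9_xform_quant_fp() */
    };
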
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 8021459..3196c99 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,7 +22,8 @@
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
-
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 90155f3..aa7a91d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -393,11 +393,6 @@
// Set rd thresholds based on mode and speed setting
vp9_set_rd_speed_thresholds(cpi);
vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
- cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
- if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
- cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
- }
}
static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
@@ -593,19 +588,7 @@
cpi->oxcf = *oxcf;
cpi->pass = get_pass(cpi->oxcf.mode);
- if (cpi->oxcf.mode == REALTIME)
- cpi->oxcf.play_alternate = 0;
- cpi->oxcf.lossless = oxcf->lossless;
- if (cpi->oxcf.lossless) {
- // In lossless mode, make sure right quantizer range and correct transform
- // is set.
- cpi->oxcf.worst_allowed_q = 0;
- cpi->oxcf.best_allowed_q = 0;
- cpi->mb.itxm_add = vp9_iwht4x4_add;
- } else {
- cpi->mb.itxm_add = vp9_idct4x4_add;
- }
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -627,33 +610,30 @@
// local file playback mode == really big buffer
if (cpi->oxcf.rc_mode == RC_MODE_VBR) {
- cpi->oxcf.starting_buffer_level = 60000;
- cpi->oxcf.optimal_buffer_level = 60000;
- cpi->oxcf.maximum_buffer_size = 240000;
+ cpi->oxcf.starting_buffer_level_ms = 60000;
+ cpi->oxcf.optimal_buffer_level_ms = 60000;
+ cpi->oxcf.maximum_buffer_size_ms = 240000;
}
- cpi->oxcf.starting_buffer_level =
- vp9_rescale(cpi->oxcf.starting_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
+ rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
+ cpi->oxcf.target_bandwidth, 1000);
// Set or reset optimal and maximum buffer levels.
- if (cpi->oxcf.optimal_buffer_level == 0)
- cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ if (cpi->oxcf.optimal_buffer_level_ms == 0)
+ rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else
- cpi->oxcf.optimal_buffer_level =
- vp9_rescale(cpi->oxcf.optimal_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
+ rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
+ cpi->oxcf.target_bandwidth, 1000);
- if (cpi->oxcf.maximum_buffer_size == 0)
- cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ if (cpi->oxcf.maximum_buffer_size_ms == 0)
+ rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
else
- cpi->oxcf.maximum_buffer_size =
- vp9_rescale(cpi->oxcf.maximum_buffer_size,
- cpi->oxcf.target_bandwidth, 1000);
+ rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
+ cpi->oxcf.target_bandwidth, 1000);
// Under a configuration change, where maximum_buffer_size may change,
// keep buffer level clipped to the maximum allowed buffer size.
- rc->bits_off_target = MIN(rc->bits_off_target, cpi->oxcf.maximum_buffer_size);
- rc->buffer_level = MIN(rc->buffer_level, cpi->oxcf.maximum_buffer_size);
+ rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
// Set up frame rate and related parameters rate control values.
vp9_new_framerate(cpi, cpi->oxcf.framerate);
@@ -1439,21 +1419,6 @@
vp8_yv12_extend_frame_borders_c(dst);
}
-static int find_fp_qindex() {
- int i;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (vp9_convert_qindex_to_q(i) >= 30.0) {
- break;
- }
- }
-
- if (i == QINDEX_RANGE)
- i--;
-
- return i;
-}
-
#define WRITE_RECON_BUFFER 0
#if WRITE_RECON_BUFFER
void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
@@ -1686,7 +1651,7 @@
(cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
cpi->rc.vbr_bits_off_target,
cpi->rc.total_target_vs_actual,
- (cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
+ (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
cpi->rc.total_actual_bits, cm->base_qindex,
vp9_convert_qindex_to_q(cm->base_qindex),
(double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
@@ -2308,17 +2273,6 @@
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
}
-static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
- unsigned int *frame_flags) {
- (void) size;
- (void) dest;
- (void) frame_flags;
-
- vp9_rc_get_first_pass_params(cpi);
- vp9_set_quantizer(&cpi->common, find_fp_qindex());
- vp9_first_pass(cpi);
-}
-
static void Pass2Encode(VP9_COMP *cpi, size_t *size,
uint8_t *dest, unsigned int *frame_flags) {
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
@@ -2460,7 +2414,7 @@
cpi->refresh_alt_ref_frame = 0;
// Should we code an alternate reference frame.
- if (cpi->oxcf.play_alternate && rc->source_alt_ref_pending) {
+ if (is_altref_enabled(&cpi->oxcf) && rc->source_alt_ref_pending) {
int frames_to_arf;
#if CONFIG_MULTIPLE_ARF
@@ -2658,7 +2612,10 @@
if (cpi->pass == 1 &&
(!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
- Pass1Encode(cpi, size, dest, frame_flags);
+ const int lossless = is_lossless_requested(&cpi->oxcf);
+ cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ vp9_first_pass(cpi);
} else if (cpi->pass == 2 &&
(!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) {
Pass2Encode(cpi, size, dest, frame_flags);
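
The buffering parameters move from pre-scaled config values to explicit
milliseconds (the *_ms fields below), converted into bits inside the rate
controller. Assuming vp9_rescale(val, num, den) computes val * num / den,
the conversion reduces to this sketch:

    #include <stdint.h>

    /* e.g. a 60000 ms window at 800000 bps is 48000000 bits of buffer */
    static int64_t buffer_ms_to_bits(int64_t level_ms, int64_t bandwidth_bps) {
      return level_ms * bandwidth_bps / 1000;
    }
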
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 47c9019..6b0e228 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -216,9 +216,9 @@
int over_shoot_pct;
// buffering parameters
- int64_t starting_buffer_level; // in seconds
- int64_t optimal_buffer_level;
- int64_t maximum_buffer_size;
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
// Frame drop threshold.
int drop_frames_water_mark;
@@ -228,7 +228,6 @@
int worst_allowed_q;
int best_allowed_q;
int cq_level;
- int lossless;
AQ_MODE aq_mode; // Adaptive Quantization mode
// Internal frame size scaling.
@@ -257,7 +256,6 @@
// these parameters aren't to be used in final build don't use!!!
int play_alternate;
- int alt_freq;
int encode_breakout; // early breakout : for video conf recommend 800
@@ -286,6 +284,14 @@
vp8e_tuning tuning;
} VP9EncoderConfig;
+static INLINE int is_altref_enabled(const VP9EncoderConfig *cfg) {
+ return cfg->mode != REALTIME && cfg->play_alternate && cfg->lag_in_frames > 0;
+}
+
+static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
+ return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+}
+
static INLINE int is_best_mode(MODE mode) {
return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
}
@@ -628,6 +634,10 @@
: 0];
}
+static INLINE int get_chessboard_index(const VP9_COMMON *cm) {
+ return cm->current_video_frame % 2;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c1d925a..430f713 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -137,14 +137,13 @@
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
"%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
"%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
stats->sr_coded_error,
- stats->ssim_weighted_pred_err,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
@@ -398,6 +397,32 @@
}
}
+static int find_fp_qindex(void) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; ++i)
+ if (vp9_convert_qindex_to_q(i) >= 30.0)
+ break;
+
+ if (i == QINDEX_RANGE)
+ i--;
+
+ return i;
+}
+
+static void set_first_pass_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 ||
+ (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+}
+
void vp9_first_pass(VP9_COMP *cpi) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->mb;
@@ -434,10 +459,12 @@
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
- FIRSTPASS_STATS fps;
vp9_clear_system_state();
+ set_first_pass_params(cpi);
+ vp9_set_quantizer(cm, find_fp_qindex());
+
if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
@@ -569,73 +596,91 @@
if (cm->current_video_frame > 0) {
int tmp_err, motion_error;
int_mv mv, tmp_mv;
+ int raw_motion_error;
+ struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
- // Assume 0,0 motion with no mv overhead.
- mv.as_int = tmp_mv.as_int = 0;
- // Test last reference frame using the previous best mv as the
- // starting point (best reference) for the search.
- first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
- &motion_error);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- motion_error = (int)(motion_error * error_weight);
- }
+      // Compute the motion error of the zero motion vector using the last
+      // source frame as the reference. Skip the further motion search on
+      // the reconstructed frame if this error is small.
+ unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer
+ + recon_yoffset;
+ unscaled_last_source_buf_2d.stride =
+ cpi->unscaled_last_source->y_stride;
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
- // If the current best reference mv is not centered on 0,0 then do a 0,0
- // based search as well.
- if (best_ref_mv.as_int) {
- tmp_err = INT_MAX;
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
- &tmp_err);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- tmp_err = (int)(tmp_err * error_weight);
- }
-
- if (tmp_err < motion_error) {
- motion_error = tmp_err;
- mv.as_int = tmp_mv.as_int;
- }
- }
-
- // Search in an older reference frame.
- if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+ // TODO(pengchong): Replace the hard-coded threshold
+ if (raw_motion_error > 25) {
// Assume 0,0 motion with no mv overhead.
- int gf_motion_error;
+ mv.as_int = tmp_mv.as_int = 0;
- xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
- gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
- &xd->plane[0].pre[0]);
-
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
- &gf_motion_error);
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
+ &motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
- gf_motion_error = (int)(gf_motion_error * error_weight);
+ motion_error = (int)(motion_error * error_weight);
}
- if (gf_motion_error < motion_error && gf_motion_error < this_error)
- ++second_ref_count;
+        // If the current best reference mv is not centered on 0,0 then do
+        // a 0,0 based search as well.
+ if (best_ref_mv.as_int) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+ &tmp_err);
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ vp9_clear_system_state();
+ tmp_err = (int)(tmp_err * error_weight);
+ }
- // Reset to last frame as reference buffer.
- xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
- xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
- xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv.as_int = tmp_mv.as_int;
+ }
+ }
- // In accumulating a score for the older reference frame take the
- // best of the motion predicted score and the intra coded error
- // (just as will be done for) accumulation of "coded_error" for
- // the last frame.
- if (gf_motion_error < this_error)
- sr_coded_error += gf_motion_error;
- else
- sr_coded_error += this_error;
- } else {
- sr_coded_error += motion_error;
+ // Search in an older reference frame.
+ if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
+
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+ &gf_motion_error);
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ vp9_clear_system_state();
+ gf_motion_error = (int)(gf_motion_error * error_weight);
+ }
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++second_ref_count;
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame, take the
+          // best of the motion-predicted score and the intra coded error
+          // (just as is done when accumulating "coded_error" for the last
+          // frame).
+ if (gf_motion_error < this_error)
+ sr_coded_error += gf_motion_error;
+ else
+ sr_coded_error += this_error;
+ } else {
+ sr_coded_error += motion_error;
+ }
}
// Start by assuming that intra mode is best.
best_ref_mv.as_int = 0;
@@ -729,6 +774,8 @@
vp9_clear_system_state();
{
+ FIRSTPASS_STATS fps;
+
fps.frame = cm->current_video_frame;
fps.spatial_layer_id = cpi->svc.spatial_layer_id;
fps.intra_error = (double)(intra_error >> 8);
@@ -767,7 +814,8 @@
fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
// Don't want to do output stats with a stack variable!
- output_stats(&fps, cpi->output_pkt_list);
+ twopass->this_frame_stats = fps;
+ output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
accumulate_stats(&twopass->total_stats, &fps);
}
@@ -775,9 +823,9 @@
// the prediction is good enough... but also don't allow it to lag too far.
if ((twopass->sr_update_lag > 3) ||
((cm->current_video_frame > 0) &&
- (fps.pcnt_inter > 0.20) &&
- ((fps.intra_error /
- DOUBLE_DIVIDE_CHECK(fps.coded_error)) > 2.0))) {
+ (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+ ((twopass->this_frame_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
if (gld_yv12 != NULL) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
}
@@ -1477,7 +1525,7 @@
double mv_in_out_accumulator = 0.0;
double abs_mv_in_out_accumulator = 0.0;
double mv_ratio_accumulator_thresh;
- unsigned int allow_alt_ref = oxcf->play_alternate && oxcf->lag_in_frames;
+ unsigned int allow_alt_ref = is_altref_enabled(oxcf);
int f_boost = 0;
int b_boost = 0;
@@ -2051,19 +2099,6 @@
twopass->modified_error_left -= kf_group_err;
}
-void vp9_rc_get_first_pass_params(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- if (!cpi->refresh_alt_ref_frame &&
- (cm->current_video_frame == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY))) {
- cm->frame_type = KEY_FRAME;
- } else {
- cm->frame_type = INTER_FRAME;
- }
- // Do not use periodic key frames.
- cpi->rc.frames_to_key = INT_MAX;
-}
-
// For VBR...adjustment to the frame target based on error from previous frames
void vbr_rate_correction(int * this_frame_target,
const int64_t vbr_bits_off_target) {
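
The restructured first-pass loop above now measures the zero-motion error
against the raw (unscaled) last source frame first, and only runs the
motion searches over the reconstructed and golden references when that raw
error exceeds the hard-coded threshold of 25 flagged by the TODO;
otherwise the zero-mv error stands. The gate itself, as a trivial
hypothetical restatement:

    /* run the full first-pass motion search only for blocks whose raw
     * zero-mv error is above the patch's threshold */
    static int should_run_motion_search(int raw_motion_error) {
      return raw_motion_error > 25;
    }
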
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index c89cfaf..8206521 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -22,7 +22,6 @@
double intra_error;
double coded_error;
double sr_coded_error;
- double ssim_weighted_pred_err;
double pcnt_inter;
double pcnt_motion;
double pcnt_second_ref;
@@ -44,6 +43,7 @@
unsigned int section_intra_rating;
unsigned int next_iiratio;
FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
const FIRSTPASS_STATS *stats_in;
const FIRSTPASS_STATS *stats_in_start;
const FIRSTPASS_STATS *stats_in_end;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9d2b2a4..48ac5f9 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -524,7 +524,8 @@
// Work out the start point for the search
bestsad = vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, ref_mv), in_what->stride);
+ get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
// Search all possible scales up to the search param around the center point
// pick the scale of the point that is best as the starting scale of
@@ -1592,3 +1593,49 @@
}
return best_sad;
}
+
+int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full,
+ int step_param, int error_per_bit,
+ const MV *ref_mv, MV *tmp_mv,
+ int var_max, int rd) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const SEARCH_METHODS method = sf->search_method;
+ vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case FAST_HEX:
+ var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case HEX:
+ var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case SQUARE:
+ var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case BIGDIA:
+ var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case NSTEP:
+ var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+ (sf->max_step_search_steps - 1) - step_param,
+ 1, fn_ptr, ref_mv, tmp_mv);
+ break;
+ default:
+ assert(!"Invalid search method.");
+ }
+
+ if (method != NSTEP && rd && var < var_max)
+ var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
+ return var;
+}
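[Annotation] vp9_full_pixel_search() consolidates the per-method dispatch that previously lived inline in vp9_rdopt.h. A hedged usage sketch mirroring the call sites changed below; every parameter is assumed to be prepared exactly as at those call sites:

    #include <limits.h>
    #include "vp9/encoder/vp9_mcomp.h"

    /* Illustrative wrapper around the new dispatcher. */
    static int run_motion_search(struct VP9_COMP *cpi, MACROBLOCK *x,
                                 BLOCK_SIZE bsize, MV ref_mv,
                                 int step_param, int sadpb) {
      MV mvp_full = ref_mv, tmp_mv;
      mvp_full.row >>= 3;   /* predictor reduced to full-pel units */
      mvp_full.col >>= 3;
      return vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
                                   sadpb, &ref_mv, &tmp_mv,
                                   INT_MAX, 1 /* rd path */);
    }

Note the refactor also applies the vp9_get_mvpred_var() refinement once after the switch instead of repeating it per method.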
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 873edf3..07e410d 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -145,6 +145,14 @@
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, const uint8_t *second_pred);
+
+struct VP9_COMP;
+
+int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full,
+ int step_param, int error_per_bit,
+ const MV *ref_mv, MV *tmp_mv,
+ int var_max, int rd);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 11633a7..4b0b85a 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -84,8 +84,8 @@
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
- &tmp_mv->as_mv, INT_MAX, 0);
+ vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
+ &tmp_mv->as_mv, INT_MAX, 0);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -156,24 +156,28 @@
unsigned int sse;
int rate;
int64_t dist;
-
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
-
+ const int quant = pd->dequant[1];
unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
-
*var_y = var;
*sse_y = sse;
+ if (sse < pd->dequant[0] * pd->dequant[0] >> 6)
+ x->skip_txfm = 1;
+ else if (var < quant * quant >> 6)
+ x->skip_txfm = 2;
+ else
+ x->skip_txfm = 0;
+
// TODO(jingning) This is a temporary solution to account for frames with
// light changes. Need to customize the rate-distortion modeling for non-RD
// mode decision.
if ((sse >> 3) > var)
sse = var;
-
vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
- pd->dequant[1] >> 3, &rate, &dist);
+ quant >> 3, &rate, &dist);
*out_rate_sum = rate;
*out_dist_sum = dist << 3;
}
@@ -199,6 +203,7 @@
VP9_ALT_FLAG };
int64_t best_rd = INT64_MAX;
int64_t this_rd = INT64_MAX;
+ int skip_txfm = 0;
int rate = INT_MAX;
int64_t dist = INT64_MAX;
@@ -220,8 +225,8 @@
int mode_idx[MB_MODE_COUNT] = {0};
INTERP_FILTER filter_ref = SWITCHABLE;
int bsl = mi_width_log2_lookup[bsize];
- int pred_filter_search = (((mi_row + mi_col) >> bsl) +
- cpi->sf.chessboard_index) & 0x01;
+ const int pred_filter_search = (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm)) % 2;
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -280,8 +285,7 @@
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
int rate_mv = 0;
- if (cpi->sf.disable_inter_mode_mask[bsize] &
- (1 << INTER_OFFSET(this_mode)))
+ if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
continue;
if (rd_less_than_thresh(best_rd, rd_threshes[mode_idx[this_mode]],
@@ -342,6 +346,7 @@
if (cost < best_cost) {
best_filter = filter;
best_cost = cost;
+ skip_txfm = x->skip_txfm;
}
}
@@ -350,6 +355,7 @@
dist = pf_dist[mbmi->interp_filter];
var_y = pf_var[mbmi->interp_filter];
sse_y = pf_sse[mbmi->interp_filter];
+ x->skip_txfm = skip_txfm;
} else {
mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
@@ -439,6 +445,7 @@
best_mode = this_mode;
best_pred_filter = mbmi->interp_filter;
best_ref_frame = ref_frame;
+ skip_txfm = x->skip_txfm;
}
if (x->skip)
@@ -451,6 +458,7 @@
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ x->skip_txfm = skip_txfm;
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
@@ -475,6 +483,8 @@
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
+ } else {
+ x->skip_txfm = skip_txfm;
}
}
}
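[Annotation] The pickmode hunk above introduces x->skip_txfm with three states, comparing the block error against the squared quantizer step scaled by 1/64. A scalar sketch of the classification (the helper name is hypothetical; the constants follow the diff):

    /* 0: run the normal transform path, 1: all coefficients are expected
     * to quantize to zero, 2: only the DC coefficient is expected to
     * survive quantization. */
    static int classify_skip_txfm(unsigned int sse, unsigned int var,
                                  unsigned int dc_dequant,
                                  unsigned int ac_dequant) {
      if (sse < (dc_dequant * dc_dequant) >> 6)
        return 1;
      if (var < (ac_dequant * ac_dequant) >> 6)
        return 2;
      return 0;
    }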
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 4d3086d..f817bcc 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,6 +19,50 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
+void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+
+ if (!skip_block) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 16;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+ if (tmp)
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+
+ if (!skip_block) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant) >> 15;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+ if (tmp)
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -167,6 +211,7 @@
quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
: vp9_ac_quant(q, 0);
invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant;
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
cm->y_dequant[q][i] = quant;
@@ -176,6 +221,7 @@
: vp9_ac_quant(q, cm->uv_ac_delta_q);
invert_quant(&quants->uv_quant[q][i],
&quants->uv_quant_shift[q][i], quant);
+ quants->uv_quant_fp[q][i] = (1 << 16) / quant;
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
cm->uv_dequant[q][i] = quant;
@@ -193,12 +239,14 @@
for (i = 2; i < 8; i++) {
quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
quants->y_zbin[q][i] = quants->y_zbin[q][1];
quants->y_round[q][i] = quants->y_round[q][1];
cm->y_dequant[q][i] = cm->y_dequant[q][1];
quants->uv_quant[q][i] = quants->uv_quant[q][1];
+ quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
quants->uv_round[q][i] = quants->uv_round[q][1];
@@ -227,6 +275,7 @@
// Y
x->plane[0].quant = quants->y_quant[qindex];
+ x->plane[0].quant_fp = quants->y_quant_fp[qindex];
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
@@ -236,6 +285,7 @@
// UV
for (i = 1; i < 3; i++) {
x->plane[i].quant = quants->uv_quant[qindex];
+ x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
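[Annotation] The new *_quant_fp tables store (1 << 16) / quant, turning the division by the quantizer step into a multiply and shift, which is how vp9_quantize_dc() above applies it: (abs_coeff + round) * quant >> 16. A self-contained numeric check of the idea (values illustrative):

    #include <stdio.h>

    int main(void) {
      const int dequant = 24;                    /* example dequantizer step */
      const int quant_fp = (1 << 16) / dequant;  /* as in vp9_init_quantizer */
      const int coeff = 200, round = 12;
      const int level = ((coeff + round) * quant_fp) >> 16;  /* ~coeff/dequant */
      printf("level=%d reconstructed=%d\n", level, level * dequant);
      return 0;
    }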
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 1835e9c..0e90462 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -24,6 +24,11 @@
DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+ // TODO(jingning): re-working of the quantization is in progress. Will
+ // decide whether to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+
DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
@@ -37,6 +42,14 @@
#endif
} QUANTS;
+void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index b58eac9..143c23b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -159,7 +159,7 @@
lrc->bits_off_target += bits_off_for_this_layer;
// Clip buffer level to maximum buffer size for the layer.
- lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
+ lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = lrc->bits_off_target;
}
}
@@ -167,7 +167,6 @@
// Update the buffer level: leaky bucket model.
static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) {
const VP9_COMMON *const cm = &cpi->common;
- const VP9EncoderConfig *oxcf = &cpi->oxcf;
RATE_CONTROL *const rc = &cpi->rc;
// Non-viewable frames are a special case and are treated as pure overhead.
@@ -178,7 +177,7 @@
}
// Clip the buffer level to the maximum specified buffer size.
- rc->bits_off_target = MIN(rc->bits_off_target, oxcf->maximum_buffer_size);
+ rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = rc->bits_off_target;
if (cpi->use_svc && cpi->oxcf.rc_mode == RC_MODE_CBR) {
@@ -188,23 +187,20 @@
void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
if (pass == 0 && oxcf->rc_mode == RC_MODE_CBR) {
- rc->avg_frame_qindex[0] = oxcf->worst_allowed_q;
- rc->avg_frame_qindex[1] = oxcf->worst_allowed_q;
- rc->avg_frame_qindex[2] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
} else {
- rc->avg_frame_qindex[0] = (oxcf->worst_allowed_q +
- oxcf->best_allowed_q) / 2;
- rc->avg_frame_qindex[1] = (oxcf->worst_allowed_q +
- oxcf->best_allowed_q) / 2;
- rc->avg_frame_qindex[2] = (oxcf->worst_allowed_q +
- oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
+ oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
+ oxcf->best_allowed_q) / 2;
}
rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
rc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
- rc->buffer_level = oxcf->starting_buffer_level;
- rc->bits_off_target = oxcf->starting_buffer_level;
+ rc->buffer_level = rc->starting_buffer_level;
+ rc->bits_off_target = rc->starting_buffer_level;
rc->rolling_target_bits = rc->avg_frame_bandwidth;
rc->rolling_actual_bits = rc->avg_frame_bandwidth;
@@ -250,7 +246,7 @@
// If buffer is below drop_mark, for now just drop every other frame
// (starting with the next frame) until it increases back over drop_mark.
int drop_mark = (int)(oxcf->drop_frames_water_mark *
- oxcf->optimal_buffer_level / 100);
+ rc->optimal_buffer_level / 100);
if ((rc->buffer_level > drop_mark) &&
(rc->decimation_factor > 0)) {
--rc->decimation_factor;
@@ -432,7 +428,6 @@
: rc->last_q[INTER_FRAME] * 2;
}
}
-
return MIN(active_worst_quality, rc->worst_quality);
}
@@ -444,10 +439,9 @@
// ambient Q (at buffer = optimal level) to worst_quality level
// (at buffer = critical level).
const VP9_COMMON *const cm = &cpi->common;
- const VP9EncoderConfig *oxcf = &cpi->oxcf;
const RATE_CONTROL *rc = &cpi->rc;
// Buffer level below which we push active_worst to worst_quality.
- int64_t critical_level = oxcf->optimal_buffer_level >> 2;
+ int64_t critical_level = rc->optimal_buffer_level >> 2;
int64_t buff_lvl_step = 0;
int adjustment = 0;
int active_worst_quality;
@@ -459,26 +453,26 @@
else
active_worst_quality = MIN(rc->worst_quality,
rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
- if (rc->buffer_level > oxcf->optimal_buffer_level) {
+ if (rc->buffer_level > rc->optimal_buffer_level) {
// Adjust down.
// Maximum limit for down adjustment, ~30%.
int max_adjustment_down = active_worst_quality / 3;
if (max_adjustment_down) {
- buff_lvl_step = ((oxcf->maximum_buffer_size -
- oxcf->optimal_buffer_level) / max_adjustment_down);
+ buff_lvl_step = ((rc->maximum_buffer_size -
+ rc->optimal_buffer_level) / max_adjustment_down);
if (buff_lvl_step)
- adjustment = (int)((rc->buffer_level - oxcf->optimal_buffer_level) /
+ adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
buff_lvl_step);
active_worst_quality -= adjustment;
}
} else if (rc->buffer_level > critical_level) {
// Adjust up from ambient Q.
if (critical_level) {
- buff_lvl_step = (oxcf->optimal_buffer_level - critical_level);
+ buff_lvl_step = (rc->optimal_buffer_level - critical_level);
if (buff_lvl_step) {
adjustment =
(int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
- (oxcf->optimal_buffer_level - rc->buffer_level) /
+ (rc->optimal_buffer_level - rc->buffer_level) /
buff_lvl_step);
}
active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
@@ -644,7 +638,7 @@
int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
last_boosted_q * 0.75);
active_best_quality = MAX(qindex + delta_qindex, rc->best_quality);
- } else if (cm->current_video_frame > 0) {
+ } else {
// not first frame of one pass and kf_boost is set
double q_adj_factor = 1.0;
double q_val;
@@ -987,7 +981,6 @@
} else {
q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
}
-
if (cpi->sf.use_nonrd_pick_mode) {
if (cpi->sf.force_frame_boost == 1)
q -= cpi->sf.max_delta_qindex;
@@ -1086,21 +1079,21 @@
rc->last_q[KEY_FRAME] = qindex;
rc->avg_frame_qindex[KEY_FRAME] =
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
- } else if (!rc->is_src_frame_alt_ref &&
- (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) &&
- !(cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
- rc->avg_frame_qindex[2] =
- ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[2] + qindex, 2);
} else {
- rc->last_q[INTER_FRAME] = qindex;
- rc->avg_frame_qindex[INTER_FRAME] =
+ if (rc->is_src_frame_alt_ref ||
+ !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) ||
+ (cpi->use_svc && oxcf->rc_mode == RC_MODE_CBR)) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
- rc->ni_frames++;
- rc->tot_q += vp9_convert_qindex_to_q(qindex);
- rc->avg_q = rc->tot_q / rc->ni_frames;
- // Calculate the average Q for normal inter frames (not key or GFU frames).
- rc->ni_tot_qi += qindex;
- rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ rc->ni_frames++;
+ rc->tot_q += vp9_convert_qindex_to_q(qindex);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
}
// Keep record of last boosted (KF/GF/ARF) Q value.
@@ -1136,7 +1129,7 @@
rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
- if (oxcf->play_alternate && cpi->refresh_alt_ref_frame &&
+ if (is_altref_enabled(oxcf) && cpi->refresh_alt_ref_frame &&
(cm->frame_type != KEY_FRAME))
// Update the alternate reference frame stats as appropriate.
update_alt_ref_frame_stats(cpi);
@@ -1227,8 +1220,8 @@
const VP9EncoderConfig *oxcf = &cpi->oxcf;
const RATE_CONTROL *rc = &cpi->rc;
const SVC *const svc = &cpi->svc;
- const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level;
- const int64_t one_pct_bits = 1 + oxcf->optimal_buffer_level / 100;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
int target = rc->avg_frame_bandwidth;
if (svc->number_temporal_layers > 1 &&
@@ -1259,8 +1252,8 @@
const SVC *const svc = &cpi->svc;
int target;
if (cpi->common.current_video_frame == 0) {
- target = ((cpi->oxcf.starting_buffer_level / 2) > INT_MAX)
- ? INT_MAX : (int)(cpi->oxcf.starting_buffer_level / 2);
+ target = ((rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX : (int)(rc->starting_buffer_level / 2);
} else {
int kf_boost = 32;
double framerate = oxcf->framerate;
@@ -1396,8 +1389,7 @@
// Extended interval for genuinely static scenes
rc->static_scene_max_gf_interval = oxcf->key_freq >> 1;
- // Special conditions when alt ref frame enabled
- if (oxcf->play_alternate && oxcf->lag_in_frames) {
+ if (is_altref_enabled(oxcf)) {
if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
}
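[Annotation] The rate-control edits above move the buffer-level fields from the encoder config into RATE_CONTROL itself; the underlying leaky-bucket update is unchanged. A sketch of that model, with field names mirroring the diff:

    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    typedef struct {
      int64_t bits_off_target;
      int64_t buffer_level;
      int64_t maximum_buffer_size;  /* now lives in RATE_CONTROL, not oxcf */
      int avg_frame_bandwidth;
    } rate_control_t;

    /* The bucket drains by the per-frame budget and fills by what the
     * frame actually cost; the level is clipped to the maximum size. */
    static void update_level(rate_control_t *rc, int encoded_frame_size) {
      rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
      rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
      rc->buffer_level = rc->bits_off_target;
    }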
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index d6a0151..f1a4a3f 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -61,7 +61,7 @@
int ni_av_qi;
int ni_tot_qi;
int ni_frames;
- int avg_frame_qindex[3]; // 0 - KEY, 1 - INTER, 2 - ARF/GF
+ int avg_frame_qindex[FRAME_TYPES];
double tot_q;
double avg_q;
@@ -84,6 +84,10 @@
int worst_quality;
int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
// int active_best_quality;
} RATE_CONTROL;
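[Annotation] avg_frame_qindex shrinks from three ad-hoc slots (slot 2 was ARF/GF) to one per frame type, matching the indexing changes in vp9_ratectrl.c above. Assuming FRAME_TYPES is the usual KEY_FRAME/INTER_FRAME pair, the array is now addressed symbolically:

    /* Illustrative declarations; the real enum lives in the vp9 headers. */
    typedef enum { KEY_FRAME = 0, INTER_FRAME = 1, FRAME_TYPES = 2 } frame_type_e;

    static int avg_frame_qindex[FRAME_TYPES];

    static void record_qindex(frame_type_e type, int qindex) {
      /* rounded running average, as ROUND_POWER_OF_TWO(3 * old + q, 2) */
      avg_frame_qindex[type] = (3 * avg_frame_qindex[type] + qindex + 2) >> 2;
    }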
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index d402d7b..e110df2 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -311,8 +311,8 @@
x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
x->errorperbit += (x->errorperbit == 0);
- x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
- cm->frame_type != KEY_FRAME) ? 0 : 1;
+ x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cm->frame_type != KEY_FRAME) ? 0 : 1;
set_block_thresholds(cm, rd);
@@ -796,11 +796,11 @@
}
}
-static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
- int *rate, int64_t *distortion,
- int *skip, int64_t *sse,
- int64_t ref_best_rd,
- BLOCK_SIZE bs) {
+static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
+ int *rate, int64_t *distortion,
+ int *skip, int64_t *sse,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
@@ -815,12 +815,12 @@
cpi->tx_stepdown_count[0]++;
}
-static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
- int (*r)[2], int *rate,
- int64_t *d, int64_t *distortion,
- int *s, int *skip,
- int64_t tx_cache[TX_MODES],
- BLOCK_SIZE bs) {
+static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
+ int (*r)[2], int *rate,
+ int64_t *d, int64_t *distortion,
+ int *s, int *skip,
+ int64_t tx_cache[TX_MODES],
+ BLOCK_SIZE bs) {
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -898,12 +898,12 @@
return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
}
-static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
- int (*r)[2], int *rate,
- int64_t *d, int64_t *distortion,
- int *s, int *skip, int64_t *sse,
- int64_t ref_best_rd,
- BLOCK_SIZE bs) {
+static void choose_tx_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
+ int (*r)[2], int *rate,
+ int64_t *d, int64_t *distortion,
+ int *s, int *skip, int64_t *sse,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -987,28 +987,20 @@
if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
- ref_best_rd, bs);
+ choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
+ bs);
if (psse)
*psse = sse[mbmi->tx_size];
return;
}
- if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
- &r[tx_size][0], &d[tx_size], &s[tx_size]);
- choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
- skip, sse, ref_best_rd, bs);
- } else {
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
- &s[tx_size], &sse[tx_size],
- ref_best_rd, 0, bs, tx_size,
- cpi->sf.use_fast_coef_costing);
- choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
- skip, txfm_cache, bs);
- }
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size],
+ &sse[tx_size], ref_best_rd, 0, bs, tx_size,
+ cpi->sf.use_fast_coef_costing);
+ choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s,
+ skip, txfm_cache, bs);
+
if (psse)
*psse = sse[mbmi->tx_size];
}
@@ -1025,8 +1017,8 @@
assert(bs == mbmi->sb_type);
if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
- ref_best_rd, bs);
+ choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd,
+ bs);
} else {
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES];
@@ -1036,8 +1028,8 @@
&s[tx_size], &sse[tx_size],
ref_best_rd, 0, bs, tx_size,
cpi->sf.use_fast_coef_costing);
- choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
- skip, txfm_cache, bs);
+ choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
+ bs);
}
if (psse)
*psse = sse[mbmi->tx_size];
@@ -1338,7 +1330,7 @@
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
+ const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
int plane;
int pnrate = 0, pnskip = 1;
int64_t pndist = 0, pnsse = 0;
@@ -1359,7 +1351,7 @@
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
- ref_best_rd, plane, bsize, uv_txfm_size,
+ ref_best_rd, plane, bsize, uv_tx_size,
cpi->sf.use_fast_coef_costing);
if (pnrate == INT_MAX)
goto term;
@@ -1411,7 +1403,7 @@
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
- if (!x->select_txfm_size)
+ if (!x->select_tx_size)
swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
}
}
@@ -1675,9 +1667,9 @@
static int check_best_zero_mv(
const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
- int disable_inter_mode_mask, int this_mode,
+ int inter_mode_mask, int this_mode,
const MV_REFERENCE_FRAME ref_frames[2]) {
- if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
+ if ((inter_mode_mask & (1 << ZEROMV)) &&
(this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
(ref_frames[1] == NONE ||
@@ -1743,7 +1735,7 @@
ENTROPY_CONTEXT t_above[2], t_left[2];
int subpelmv = 1, have_ref = 0;
const int has_second_rf = has_second_ref(mbmi);
- const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+ const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
vp9_zero(*bsi);
@@ -1792,11 +1784,11 @@
mode_idx = INTER_OFFSET(this_mode);
bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
- if (disable_inter_mode_mask & (1 << mode_idx))
+ if (!(inter_mode_mask & (1 << this_mode)))
continue;
if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
- disable_inter_mode_mask,
+ inter_mode_mask,
this_mode, mbmi->ref_frame))
continue;
@@ -1858,9 +1850,9 @@
vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
- bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
- sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
- INT_MAX, 1);
+ bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
+ sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
+ INT_MAX, 1);
// Should we do a full search (best quality only)
if (is_best_mode(cpi->oxcf.mode)) {
@@ -2393,8 +2385,8 @@
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
- &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+ bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+ &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -2810,7 +2802,8 @@
*rate2 += vp9_get_switchable_rate(cpi);
if (!is_comp_pred) {
- if (!x->in_active_map) {
+ if (!x->in_active_map ||
+ vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
if (psse)
*psse = 0;
*distortion = 0;
@@ -3063,7 +3056,7 @@
const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
const int intra_y_mode_mask =
cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
- int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
+ int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
vp9_zero(best_mbmode);
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3127,10 +3120,8 @@
// If the segment skip feature is enabled,
// then do nothing if the current mode is not allowed.
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
- const int inter_non_zero_mode_mask = 0x1F7F7;
- mode_skip_mask |= inter_non_zero_mode_mask;
- mode_skip_mask &= ~(1 << THR_ZEROMV);
- disable_inter_mode_mask = ~(1 << INTER_OFFSET(ZEROMV));
+ mode_skip_mask = ~(1 << THR_ZEROMV);
+ inter_mode_mask = (1 << ZEROMV);
}
// Disable this drop out case if the ref frame
@@ -3182,7 +3173,8 @@
mode_index = THR_ZEROMV;
mode_skip_mask = ~(1 << mode_index);
mode_skip_start = MAX_MODES;
- disable_inter_mode_mask = 0;
+ inter_mode_mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+ (1 << NEWMV);
}
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3229,8 +3221,7 @@
this_mode = vp9_mode_order[mode_index].mode;
ref_frame = vp9_mode_order[mode_index].ref_frame[0];
- if (ref_frame != INTRA_FRAME &&
- disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
+ if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
continue;
second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
@@ -3279,7 +3270,7 @@
!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
- disable_inter_mode_mask, this_mode, ref_frames))
+ inter_mode_mask, this_mode, ref_frames))
continue;
}
}
@@ -3450,7 +3441,7 @@
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- if (!x->select_txfm_size)
+ if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -3665,7 +3656,6 @@
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
- int ref_frame_mask = 0;
int mode_skip_mask = 0;
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -3700,17 +3690,6 @@
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
- for (ref_frame = LAST_FRAME;
- ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
- int i;
- for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
- if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
- ref_frame_mask |= (1 << ref_frame);
- break;
- }
- }
- }
-
for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
int mode_excluded = 0;
int64_t this_rd = INT64_MAX;
@@ -3805,11 +3784,6 @@
vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
(int)ref_frame) {
continue;
- // If the segment skip feature is enabled....
- // then do nothing if the current mode is not allowed..
- } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
- ref_frame != INTRA_FRAME) {
- continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
// prevent the possibility that we end up unable to pick any mode.
@@ -4034,15 +4008,10 @@
}
if (!disable_skip) {
- // Test for the condition where skip block will be activated
- // because there are no non zero coefficients and make any
- // necessary adjustment for rate. Ignore if skip is coded at
- // segment level as the cost wont have been added in.
- // Is Mb level skip allowed (i.e. not coded at segment level).
- const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
- SEG_LVL_SKIP);
+ // Skip is never coded at the segment level for sub8x8 blocks and instead
+ // always coded in the bitstream at the mode info level.
- if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (ref_frame != INTRA_FRAME && !xd->lossless) {
if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
// Add in the cost of the no skip flag.
@@ -4057,7 +4026,7 @@
rate_uv = 0;
this_skip2 = 1;
}
- } else if (mb_skip_allowed) {
+ } else {
// Add in the cost of the no skip flag.
rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
}
@@ -4102,7 +4071,7 @@
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- if (!x->select_txfm_size)
+ if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
sizeof(uint8_t) * ctx->num_4x4_blk);
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 5ea09a8..6e56317 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -94,49 +94,6 @@
return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
}
-static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, MV *mvp_full,
- int step_param, int error_per_bit,
- const MV *ref_mv, MV *tmp_mv,
- int var_max, int rd) {
- int var = 0;
-
- if (cpi->sf.search_method == FAST_DIAMOND) {
- var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
- &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
- if (rd && var < var_max)
- var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
- } else if (cpi->sf.search_method == FAST_HEX) {
- var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
- &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
- if (rd && var < var_max)
- var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
- } else if (cpi->sf.search_method == HEX) {
- var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
- &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
- if (rd && var < var_max)
- var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
- } else if (cpi->sf.search_method == SQUARE) {
- var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
- &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
- if (rd && var < var_max)
- var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
- } else if (cpi->sf.search_method == BIGDIA) {
- var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
- &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv);
- if (rd && var < var_max)
- var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1);
- } else {
- int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
- var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
- further_steps, 1, &cpi->fn_ptr[bsize],
- ref_mv, tmp_mv);
- }
-
- return var;
-}
-
#ifdef __cplusplus
} // extern "C"
#endif
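[Annotation] With the inline helper above removed, vp9_rdopt.h no longer needs the encoder type at all, and the new prototype in vp9_mcomp.h gets by with an incomplete type. The pattern in isolation (function name illustrative):

    /* A forward declaration supplies only the struct tag, which is
     * sufficient for pointer parameters, so the header avoids pulling in
     * the full encoder definition. */
    struct VP9_COMP;

    int uses_encoder(struct VP9_COMP *cpi);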
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4c3d34d..700862f 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -28,6 +28,12 @@
};
enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
+};
+
+enum {
DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) |
(1 << THR_COMP_LA) |
(1 << THR_ALTR) |
@@ -234,10 +240,10 @@
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
sf->frame_parameter_update = 0;
sf->search_method = FAST_HEX;
- sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
- sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
- sf->disable_inter_mode_mask[BLOCK_64X32] = ~(1 << INTER_OFFSET(NEARESTMV));
- sf->disable_inter_mode_mask[BLOCK_64X64] = ~(1 << INTER_OFFSET(NEARESTMV));
+ sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW;
+ sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
sf->max_intra_bsize = BLOCK_32X32;
sf->allow_skip_recode = 1;
}
@@ -255,7 +261,6 @@
sf->use_nonrd_pick_mode = 1;
sf->search_method = FAST_DIAMOND;
sf->allow_skip_recode = 0;
- sf->chessboard_index = cm->current_video_frame & 0x01;
}
if (speed >= 6) {
@@ -263,12 +268,14 @@
sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
sf->search_type_check_frequency = 50;
sf->source_var_thresh = 360;
+
+ sf->tx_size_search_method = USE_TX_8X8;
}
if (speed >= 7) {
int i;
for (i = 0; i < BLOCK_SIZES; ++i)
- sf->disable_inter_mode_mask[i] = ~(1 << INTER_OFFSET(NEARESTMV));
+ sf->inter_mode_mask[i] = INTER_NEAREST;
}
}
@@ -285,7 +292,7 @@
sf->subpel_search_method = SUBPEL_TREE;
sf->subpel_iters_per_step = 2;
sf->subpel_force_stop = 0;
- sf->optimize_coefficients = !oxcf->lossless;
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
sf->reduce_first_step_size = 0;
sf->auto_mv_step_size = 0;
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -326,7 +333,7 @@
sf->mode_skip_start = MAX_MODES;  // Mode index at which the mode skip mask is set
sf->use_nonrd_pick_mode = 0;
for (i = 0; i < BLOCK_SIZES; ++i)
- sf->disable_inter_mode_mask[i] = 0;
+ sf->inter_mode_mask[i] = INTER_ALL;
sf->max_intra_bsize = BLOCK_64X64;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index a54599e..4a1a13b 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -56,9 +56,8 @@
typedef enum {
USE_FULL_RD = 0,
- USE_LARGESTINTRA,
- USE_LARGESTINTRA_MODELINTER,
- USE_LARGESTALL
+ USE_LARGESTALL,
+ USE_TX_8X8
} TX_SIZE_SEARCH_METHOD;
typedef enum {
@@ -283,9 +282,6 @@
// encoding process for RTC.
int partition_check;
- // Chessboard pattern index
- int chessboard_index;
-
// Use finer quantizer in every other few frames that run variable block
// partition type search.
int force_frame_boost;
@@ -331,8 +327,8 @@
int use_nonrd_pick_mode;
// A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
- // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
- int disable_inter_mode_mask[BLOCK_SIZES];
+ // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+ int inter_mode_mask[BLOCK_SIZES];
// This feature controls whether we do the expensive context update and
// calculation in the rd coefficient costing loop.
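[Annotation] The speed-feature mask is inverted by this change: bits are now indexed directly by the prediction-mode value, and a set bit means the mode is searched, replacing the old disable mask indexed via INTER_OFFSET(). A standalone illustration (mode values mirror vp9's ordering, where NEARESTMV is 10):

    #include <stdio.h>

    enum { NEARESTMV = 10, NEARMV, ZEROMV, NEWMV };  /* vp9 inter modes */

    int main(void) {
      /* INTER_NEAREST_NEAR_NEW from vp9_speed_features.c */
      const int mask = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV);
      int mode;
      for (mode = NEARESTMV; mode <= NEWMV; ++mode)
        printf("mode %d: %s\n", mode,
               (mask & (1 << mode)) ? "searched" : "skipped");
      return 0;
    }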
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index c25314b..1b99575 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -54,7 +54,7 @@
lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
}
- lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level),
+ lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
lc->target_bandwidth, 1000);
lrc->bits_off_target = lrc->buffer_level;
}
@@ -87,14 +87,14 @@
}
bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
// Update buffer-related quantities.
- lc->starting_buffer_level =
- (int64_t)(oxcf->starting_buffer_level * bitrate_alloc);
- lc->optimal_buffer_level =
- (int64_t)(oxcf->optimal_buffer_level * bitrate_alloc);
- lc->maximum_buffer_size =
- (int64_t)(oxcf->maximum_buffer_size * bitrate_alloc);
- lrc->bits_off_target = MIN(lrc->bits_off_target, lc->maximum_buffer_size);
- lrc->buffer_level = MIN(lrc->buffer_level, lc->maximum_buffer_size);
+ lrc->starting_buffer_level =
+ (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+ lrc->optimal_buffer_level =
+ (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+ lrc->maximum_buffer_size =
+ (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+ lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
// Update framerate-related quantities.
if (svc->number_temporal_layers > 1) {
lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
@@ -160,9 +160,6 @@
cpi->rc = lc->rc;
cpi->twopass = lc->twopass;
cpi->oxcf.target_bandwidth = lc->target_bandwidth;
- cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
- cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
- cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
// Reset the frames_since_key and frames_to_key counters to their values
// before the layer restore. Keep these defined for the stream (not layer).
if (cpi->svc.number_temporal_layers > 1) {
@@ -178,9 +175,6 @@
lc->rc = cpi->rc;
lc->twopass = cpi->twopass;
lc->target_bandwidth = (int)oxcf->target_bandwidth;
- lc->starting_buffer_level = oxcf->starting_buffer_level;
- lc->optimal_buffer_level = oxcf->optimal_buffer_level;
- lc->maximum_buffer_size = oxcf->maximum_buffer_size;
}
void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 6881ce1..36e2027 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -22,9 +22,6 @@
typedef struct {
RATE_CONTROL rc;
int target_bandwidth;
- int64_t starting_buffer_level;
- int64_t optimal_buffer_level;
- int64_t maximum_buffer_size;
double framerate;
int avg_frame_size;
TWO_PASS twopass;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 02bed89..eb5ae2e 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -156,16 +156,15 @@
return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
}
-
-void vp9_get_sse_sum_16x16_c(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
}
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse, int *sum) {
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
}
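[Annotation] The renamed helpers return the raw SSE and sum; callers derive the variance from var = SSE - sum^2 / N, with N the pixel count. A scalar restatement of the identity used throughout the variance wrappers:

    #include <stdint.h>

    /* var(X) = E[X^2] - E[X]^2 scaled by N, i.e. SSE - sum*sum/N.
     * log2_count is 6 for an 8x8 block and 8 for 16x16, matching the
     * shifts in the MMX/SSE2 callers. */
    static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                              int log2_count) {
      return sse - (unsigned int)(((int64_t)sum * sum) >> log2_count);
    }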
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 1f58d87..487deef 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,6 +12,35 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1;
+ __m128i tmp;
+ const __m128i zero = _mm_setzero_si128();
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+
+ tmp = _mm_add_epi16(in0, in1);
+ in0 = _mm_unpacklo_epi16(zero, tmp);
+ in1 = _mm_unpackhi_epi16(zero, tmp);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(tmp, zero);
+ in1 = _mm_unpackhi_epi32(tmp, zero);
+
+ tmp = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(tmp, 8);
+
+ in1 = _mm_add_epi32(tmp, in0);
+ in0 = _mm_slli_epi32(in1, 1);
+ _mm_store_si128((__m128i *)(output), in0);
+}
+
void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// This 2D transform implements 4 vertical 1D transforms followed
// by 4 horizontal 1D transforms. The multiplies and adds are as given
@@ -377,6 +406,46 @@
}
}
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
@@ -1168,6 +1237,74 @@
}
}
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ input += 8 * i;
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 1);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
@@ -2680,6 +2817,77 @@
}
}
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1, in2, in3;
+ __m128i u0, u1;
+ __m128i sum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; ++i) {
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 16));
+ in3 = _mm_load_si128((const __m128i *)(input + 24));
+
+ input += stride;
+ sum = _mm_add_epi16(sum, u1);
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, u0);
+
+ sum = _mm_add_epi16(sum, u1);
+ }
+
+ u0 = _mm_setzero_si128();
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ in1 = _mm_srai_epi32(in1, 3);
+ _mm_store_si128((__m128i *)(output), in1);
+}
+
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 8723a71..28458dc 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -23,6 +23,7 @@
pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
%endmacro
+TRANSFORM_COEFFS 11585, 11585
TRANSFORM_COEFFS 15137, 6270
TRANSFORM_COEFFS 16069, 3196
TRANSFORM_COEFFS 9102, 13623
@@ -83,7 +84,7 @@
%endmacro
; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 0
+%macro FDCT8_1D 1
SUM_SUB 0, 7, 9
SUM_SUB 1, 6, 9
SUM_SUB 2, 5, 9
@@ -92,14 +93,21 @@
SUM_SUB 0, 3, 9
SUM_SUB 1, 2, 9
SUM_SUB 6, 5, 9
+%if %1 == 0
SUM_SUB 0, 1, 9
+%endif
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10
pmulhrsw m6, m12
pmulhrsw m5, m12
+%if %1 == 0
pmulhrsw m0, m12
pmulhrsw m1, m12
+%else
+ BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10
+ SWAP 0, 1
+%endif
SUM_SUB 4, 5, 9
SUM_SUB 7, 6, 9
@@ -150,10 +158,10 @@
psllw m7, 2
; column transform
- FDCT8_1D
+ FDCT8_1D 0
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- FDCT8_1D
+ FDCT8_1D 1
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
DIVIDE_ROUND_2X 0, 1, 9, 10
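[Annotation] The FDCT8_1D macro now takes a pass argument: the column pass keeps the cheap pmulhrsw scaling, while the row pass forms the DC/Nyquist pair with an exact butterfly using the new 11585 coefficient (cos(pi/4) in Q14). A scalar sketch of that butterfly, assuming vp9's usual 1 << 13 rounding before the >> 14:

    #include <stdint.h>

    static void dc_nyquist_butterfly(int16_t a, int16_t b,
                                     int16_t *out0, int16_t *out1) {
      const int c = 11585;  /* cospi_16_64 in Q14 */
      *out0 = (int16_t)(((a + b) * c + (1 << 13)) >> 14);
      *out1 = (int16_t)(((a - b) * c + (1 << 13)) >> 14);
    }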
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index ae2f976..ce1c832 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -12,141 +12,92 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-extern unsigned int vp9_get8x8var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
-extern unsigned int vp9_get4x4var_mmx
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
+unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
-unsigned int vp9_variance4x4_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
+unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *SSE, int *sum);
- vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 4));
+unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse - (((unsigned int)sum * sum) >> 4);
}
-unsigned int vp9_variance8x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 6));
+unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse - (((unsigned int)sum * sum) >> 6);
}
-unsigned int vp9_mse16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
+unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3;
int sum0, sum1, sum2, sum3;
+ vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+ ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
+ ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
- &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
- ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
- ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- *sse = var;
- return var;
+ *sse = sse0 + sse1 + sse2 + sse3;
+ return *sse;
}
-unsigned int vp9_variance16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
+unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1, sse2, sse3;
+ int sum0, sum1, sum2, sum3, sum;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
- &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
- ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
- ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+ ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
+ ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
+ *sse = sse0 + sse1 + sse2 + sse3;
+ sum = sum0 + sum1 + sum2 + sum3;
+ return *sse - (((unsigned int)sum * sum) >> 8);
}
-unsigned int vp9_variance16x8_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
+unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1;
+ int sum0, sum1, sum;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
- &sse1, &sum1);
+ vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
+ *sse = sse0 + sse1;
+ sum = sum0 + sum1;
+ return *sse - (((unsigned int)sum * sum) >> 7);
}
-unsigned int vp9_variance8x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
+unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ unsigned int sse0, sse1;
+ int sum0, sum1, sum;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
- ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+ vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+ ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 7));
+ *sse = sse0 + sse1;
+ sum = sum0 + sum1;
+ return *sse - (((unsigned int)sum * sum) >> 7);
}
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d52424a..72768e1 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -331,8 +331,10 @@
oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
- oxcf->best_allowed_q = vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
- oxcf->worst_allowed_q = vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
+ oxcf->best_allowed_q =
+ extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
+ oxcf->worst_allowed_q =
+ extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer);
oxcf->cq_level = vp9_quantizer_to_qindex(extra_cfg->cq_level);
oxcf->fixed_q = -1;
@@ -343,9 +345,9 @@
oxcf->scaled_frame_width = cfg->rc_scaled_width;
oxcf->scaled_frame_height = cfg->rc_scaled_height;
- oxcf->maximum_buffer_size = cfg->rc_buf_sz;
- oxcf->starting_buffer_level = cfg->rc_buf_initial_sz;
- oxcf->optimal_buffer_level = cfg->rc_buf_optimal_sz;
+ oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz;
+ oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
@@ -376,8 +378,6 @@
oxcf->tile_columns = extra_cfg->tile_columns;
oxcf->tile_rows = extra_cfg->tile_rows;
- oxcf->lossless = extra_cfg->lossless;
-
oxcf->error_resilient_mode = cfg->g_error_resilient;
oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
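
Two separate cleanups land in vp9_cx_iface.c here. First, lossless mode stops being carried as a separate oxcf->lossless assignment and is folded directly into the allowed quantizer range: when extra_cfg->lossless is set, both best_allowed_q and worst_allowed_q are pinned to qindex 0, the index VP9 reserves for lossless coding. Second, the buffer-level fields gain an _ms suffix to make explicit that rc_buf_sz, rc_buf_initial_sz, and rc_buf_optimal_sz are expressed in milliseconds. A minimal sketch of how an application reaches this path through the public interface; init_lossless_vp9 is a hypothetical helper and error handling is pared down:

  #include "vpx/vpx_encoder.h"
  #include "vpx/vp8cx.h"

  /* Hypothetical helper, not part of the patch: configure a VP9 encoder
   * for lossless operation through the public control interface. */
  static int init_lossless_vp9(vpx_codec_ctx_t *codec, int width, int height) {
    vpx_codec_enc_cfg_t cfg;
    if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0))
      return -1;
    cfg.g_w = width;
    cfg.g_h = height;
    /* With this patch these bounds are overridden once lossless is on:
     * both best_allowed_q and worst_allowed_q collapse to qindex 0. */
    cfg.rc_min_quantizer = 4;
    cfg.rc_max_quantizer = 52;
    if (vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0))
      return -1;
    return vpx_codec_control(codec, VP9E_SET_LOSSLESS, 1) != VPX_CODEC_OK;
  }
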
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 6e56c84..98faa7f 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -38,8 +38,8 @@
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
- int img_avail;
int invert_tile_order;
+ int frame_parallel_decode; // frame-based threading.
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
@@ -67,6 +67,11 @@
ctx->priv->alg_priv = alg_priv;
ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
ctx->priv->init_flags = ctx->init_flags;
+ ctx->priv->alg_priv->frame_parallel_decode =
+ (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+
+ // Disable frame parallel decoding for now.
+ ctx->priv->alg_priv->frame_parallel_decode = 0;
if (ctx->config.dec) {
// Update the reference to the config structure to an internal copy.
@@ -232,6 +237,7 @@
ctx->pbi->max_threads = ctx->cfg.threads;
ctx->pbi->inv_tile_order = ctx->invert_tile_order;
+ ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
@@ -245,15 +251,11 @@
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = {0, 0, 0};
+ vp9_ppflags_t flags = {0};
VP9_COMMON *cm = NULL;
(void)deadline;
- vp9_zero(sd);
- ctx->img_avail = 0;
-
// Determine the stream parameters. Note that we rely on peek_si to
// validate that we have a buffer that does not wrap around the top
// of the heap.
@@ -288,13 +290,6 @@
if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
set_ppflags(ctx, &flags);
- if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
- return update_error_state(ctx, &cm->error);
-
- yuvconfig2image(&ctx->img, &sd, user_priv);
- ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
- ctx->img_avail = 1;
-
return VPX_CODEC_OK;
}
@@ -423,15 +418,20 @@
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
- if (ctx->img_avail) {
- // iter acts as a flip flop, so an image is only returned on the first
- // call to get_frame.
- if (!(*iter)) {
+ // iter acts as a flip flop, so an image is only returned on the first
+ // call to get_frame.
+ if (*iter == NULL && ctx->pbi != NULL) {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0, 0, 0};
+
+ if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *cm = &ctx->pbi->common;
+ yuvconfig2image(&ctx->img, &sd, NULL);
+ ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
*iter = img;
}
}
- ctx->img_avail = 0;
return img;
}
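
The vp9_dx_iface.c changes move frame retrieval out of decode_one() and into get_frame(): vp9_get_raw_frame() and yuvconfig2image() now run when the application asks for the image, which lets the img_avail flag be deleted outright, with *iter alone acting as the flip-flop so the image is handed out exactly once per decode. The new frame_parallel_decode field is plumbed through to the decoder but force-disabled for now. One visible side effect: get_frame() now passes NULL to yuvconfig2image() where decode_one() previously forwarded user_priv. The application-side calling pattern is unchanged; a minimal sketch, where decode_and_show is a hypothetical helper and error handling is omitted:

  #include <stddef.h>
  #include "vpx/vpx_decoder.h"

  /* Hypothetical helper, not part of the patch: the standard decode loop.
   * The refactored get_frame() behaves identically from this side. */
  static void decode_and_show(vpx_codec_ctx_t *codec,
                              const uint8_t *data, size_t data_sz) {
    vpx_codec_iter_t iter = NULL;
    vpx_image_t *img;
    if (vpx_codec_decode(codec, data, (unsigned int)data_sz, NULL, 0))
      return;
    /* iter is the flip-flop from the patch comment: the first call returns
     * the decoded image, later calls return NULL. */
    while ((img = vpx_codec_get_frame(codec, &iter)) != NULL) {
      /* consume img->planes[] / img->stride[] here */
    }
  }
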
diff --git a/vpx/exports_enc b/vpx/exports_enc
index 155faf6..07f0280 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -8,7 +8,6 @@
text vpx_codec_set_cx_data_buf
text vpx_svc_dump_statistics
text vpx_svc_encode
-text vpx_svc_free
text vpx_svc_get_buffer
text vpx_svc_get_encode_frame_count
text vpx_svc_get_frame_size
@@ -22,4 +21,4 @@
text vpx_svc_set_scale_factors
text vpx_svc_get_layer_resolution
text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
\ No newline at end of file
+text vpx_svc_get_rc_stats_buffer