Merge "Fix an unused variable warning."
diff --git a/build/make/Makefile b/build/make/Makefile
index 40fa6d5..62d139e 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -153,7 +153,7 @@
#
obj_int_extract: build/make/obj_int_extract.c
$(if $(quiet),echo " [HOSTCC] $@")
- $(qexec)$(HOSTCC) -I. -o $@ $<
+ $(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
CLEAN-OBJS += obj_int_extract
#
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index becd958..c2ef44a 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -33,6 +33,7 @@
--proj-guid=GUID GUID to use for the project
--module-def=filename File containing export definitions (for DLLs)
--ver=version Version (7,8,9) of visual studio to generate for
+ --src-path-bare=dir Path to root of source tree
-Ipath/to/include Additional include directories
-DFLAG[=value] Preprocessor macros to define
-Lpath/to/lib Additional library search paths
@@ -191,6 +192,8 @@
;;
--lib) proj_kind="lib"
;;
+ --src-path-bare=*) src_path_bare="$optval"
+ ;;
--static-crt) use_static_runtime=true
;;
--ver=*)
@@ -335,6 +338,35 @@
case "$target" in
x86*)
case "$name" in
+ obj_int_extract)
+ tag Tool \
+ Name="VCCLCompilerTool" \
+ Optimization="0" \
+ AdditionalIncludeDirectories="$incs" \
+ PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+ RuntimeLibrary="$debug_runtime" \
+ WarningLevel="3" \
+ Detect64BitPortabilityProblems="true" \
+ DebugInformationFormat="1" \
+ ;;
+ vpx)
+ tag Tool \
+ Name="VCPreBuildEventTool" \
+ CommandLine="call obj_int_extract.bat $src_path_bare" \
+
+ tag Tool \
+ Name="VCCLCompilerTool" \
+ Optimization="0" \
+ AdditionalIncludeDirectories="$incs" \
+ PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+ RuntimeLibrary="$debug_runtime" \
+ UsePrecompiledHeader="0" \
+ WarningLevel="3" \
+ DebugInformationFormat="1" \
+ Detect64BitPortabilityProblems="true" \
+
+ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1"
+ ;;
*)
tag Tool \
Name="VCCLCompilerTool" \
@@ -358,6 +390,12 @@
case "$target" in
x86*)
case "$name" in
+ obj_int_extract)
+ tag Tool \
+ Name="VCLinkerTool" \
+ OutputFile="${name}.exe" \
+ GenerateDebugInformation="true" \
+ ;;
*)
tag Tool \
Name="VCLinkerTool" \
@@ -406,6 +444,34 @@
case "$target" in
x86*)
case "$name" in
+ obj_int_extract)
+ tag Tool \
+ Name="VCCLCompilerTool" \
+ AdditionalIncludeDirectories="$incs" \
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+ RuntimeLibrary="$release_runtime" \
+ UsePrecompiledHeader="0" \
+ WarningLevel="3" \
+ Detect64BitPortabilityProblems="true" \
+ DebugInformationFormat="0" \
+ ;;
+ vpx)
+ tag Tool \
+ Name="VCPreBuildEventTool" \
+ CommandLine="call obj_int_extract.bat $src_path_bare" \
+
+ tag Tool \
+ Name="VCCLCompilerTool" \
+ AdditionalIncludeDirectories="$incs" \
+ PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+ RuntimeLibrary="$release_runtime" \
+ UsePrecompiledHeader="0" \
+ WarningLevel="3" \
+ DebugInformationFormat="0" \
+ Detect64BitPortabilityProblems="true" \
+
+ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs"
+ ;;
*)
tag Tool \
Name="VCCLCompilerTool" \
@@ -428,6 +494,12 @@
case "$target" in
x86*)
case "$name" in
+ obj_int_extract)
+ tag Tool \
+ Name="VCLinkerTool" \
+ OutputFile="${name}.exe" \
+ GenerateDebugInformation="true" \
+ ;;
*)
tag Tool \
Name="VCLinkerTool" \
diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index 26cf457..c46d9d5 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c
@@ -14,7 +14,7 @@
#include "vpx_config.h"
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
#include <io.h>
#include <share.h>
#include "vpx/vpx_integer.h"
@@ -816,7 +816,7 @@
#endif
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
/* See "Microsoft Portable Executable and Common Object File Format Specification"
for reference.
*/
@@ -830,7 +830,6 @@
unsigned int i;
unsigned __int8 *ptr;
unsigned __int32 symoffset;
- FILE *fp;
char **sectionlist; //this array holds all section names in their correct order.
//it is used to check if the symbol is in .bss or .data section.
@@ -841,9 +840,18 @@
strtab_ptr = symtab_ptr + symtab_sz * 18;
if (nsections > 96)
- goto bail;
+ {
+ log_msg("Too many sections\n");
+ return 1;
+ }
- sectionlist = malloc(nsections * sizeof * sectionlist);
+    sectionlist = calloc(nsections, sizeof(*sectionlist)); /* one zeroed char* per section, so slots not yet filled are NULL if parsing bails out early */
+
+ if (sectionlist == NULL)
+ {
+ log_msg("Allocating first level of section list failed\n");
+ return 1;
+ }
//log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
@@ -861,6 +869,12 @@
//log_msg("COFF: Parsing section %s\n",sectionname);
sectionlist[i] = malloc(strlen(sectionname) + 1);
+
+ if (sectionlist[i] == NULL)
+ {
+ log_msg("Allocating storage for %s failed\n", sectionname);
+ goto bail;
+ }
strcpy(sectionlist[i], sectionname);
if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
@@ -871,14 +885,6 @@
//log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
//log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
- fp = fopen("assembly_offsets.asm", "w");
-
- if (fp == NULL)
- {
- perror("open file");
- goto bail;
- }
-
/* The compiler puts the data with non-zero offset in .data section, but puts the data with
zero offset in .bss section. So, if the data in in .bss section, set offset=0.
Note from Wiki: In an object module compiled from C, the bss section contains
@@ -912,13 +918,23 @@
char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
strncpy(name, ptr, 8);
//log_msg("COFF: Parsing symbol %s\n",name);
- fprintf(fp, "%-40s EQU ", name);
+ /* The 64-bit Windows compiler doesn't prefix symbol names with an
+ * underscore. Check the first character and skip it when it is '_'.
+ */
+ if (name[0] == '_')
+ printf("%-40s EQU ", name + 1);
+ else
+ printf("%-40s EQU ", name);
}
else
{
//log_msg("COFF: Parsing symbol %s\n",
// buf + strtab_ptr + get_le32(ptr+4));
- fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
+ if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
+ printf("%-40s EQU ",
+ buf + strtab_ptr + get_le32(ptr + 4) + 1);
+ else
+ printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
}
if (!(strcmp(sectionlist[section-1], ".bss")))
@@ -935,14 +951,13 @@
//log_msg(" Address: %u\n",get_le32(ptr+8));
//log_msg(" Offset: %u\n", symoffset);
- fprintf(fp, "%5d\n", symoffset);
+ printf("%5d\n", symoffset);
}
ptr += 18;
}
- fprintf(fp, " END\n");
- fclose(fp);
+ printf(" END\n");
for (i = 0; i < nsections; i++)
{
@@ -992,11 +1007,7 @@
else
f = argv[1];
- if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
- {
- perror("Unable to open file");
- goto bail;
- }
+ fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
if (_fstat(fd, &stat_buf))
{
diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
new file mode 100644
index 0000000..1bb8653
--- /dev/null
+++ b/build/x86-msvs/obj_int_extract.bat
@@ -0,0 +1,15 @@
+REM Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+REM
+REM Use of this source code is governed by a BSD-style license
+REM that can be found in the LICENSE file in the root of the source
+REM tree. An additional intellectual property rights grant can be found
+REM in the file PATENTS. All contributing project authors may
+REM be found in the AUTHORS file in the root of the source tree.
+echo on
+REM %1 = root of the source tree (%~1 drops surrounding quotes). Stop at the first failing step so the pre-build event reports the error.
+cl /I "./" /I "%~1" /nologo /c "%~1/vp8/common/asm_com_offsets.c" || exit /b 1
+cl /I "./" /I "%~1" /nologo /c "%~1/vp8/decoder/asm_dec_offsets.c" || exit /b 1
+cl /I "./" /I "%~1" /nologo /c "%~1/vp8/encoder/asm_enc_offsets.c" || exit /b 1
+obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm" || exit /b 1
+obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm" || exit /b 1
+obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm" || exit /b 1
diff --git a/libs.mk b/libs.mk
index 37ce0b1..350b310 100644
--- a/libs.mk
+++ b/libs.mk
@@ -9,7 +9,13 @@
##
-ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+# ARM assembly files are written in RVCT-style. We use some make magic to
+# filter those files to allow GCC compilation
+ifeq ($(ARCH_ARM),yes)
+ ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+else
+ ASM:=.asm
+endif
CODEC_SRCS-yes += libs.mk
@@ -126,6 +132,23 @@
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
+# obj_int_extract.bat is listed in PROJECTS, so give it its own rule instead of copying it as a hidden side effect of the .vcproj recipe (safe under make -j).
+obj_int_extract.bat: $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat
+	@cp $< $@
+
+obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
+	@echo " [CREATE] $@"
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	    --exe \
+	    --target=$(TOOLCHAIN) \
+	    --name=obj_int_extract \
+	    --ver=$(CONFIG_VS_VERSION) \
+	    --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
+	    $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+	    --out=$@ $^ -I. -I"$(SRC_PATH_BARE)"
+
+PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj obj_int_extract.bat
+
vpx.def: $(call enabled,CODEC_EXPORTS)
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -135,15 +158,16 @@
vpx.vcproj: $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
- $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
- --lib\
- --target=$(TOOLCHAIN)\
+ $(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+ --lib \
+ --target=$(TOOLCHAIN) \
$(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
- --name=vpx\
- --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\
- --module-def=vpx.def\
- --ver=$(CONFIG_VS_VERSION)\
- --out=$@ $(CFLAGS) $^\
+ --name=vpx \
+ --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
+ --module-def=vpx.def \
+ --ver=$(CONFIG_VS_VERSION) \
+ --out=$@ $(CFLAGS) $^ \
+ --src-path-bare="$(SRC_PATH_BARE)" \
PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj
@@ -207,36 +231,38 @@
#
# Add assembler dependencies for configuration and offsets
#
-$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
-$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
#
# Calculate platform- and compiler-specific offsets for hand coded assembly
#
-ifeq ($(ARCH_ARM), yes)
- asm_com_offsets.asm: obj_int_extract
- asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
+ ifeq ($(ARCH_ARM), yes)
+ asm_com_offsets.asm: obj_int_extract
+ asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
- CLEAN-OBJS += asm_com_offsets.asm
- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+ OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+ CLEAN-OBJS += asm_com_offsets.asm
+ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
- ifeq ($(CONFIG_VP8_ENCODER), yes)
- asm_enc_offsets.asm: obj_int_extract
- asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+ ifeq ($(CONFIG_VP8_ENCODER), yes)
+ asm_enc_offsets.asm: obj_int_extract
+ asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
- CLEAN-OBJS += asm_enc_offsets.asm
- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
- endif
+ OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+ CLEAN-OBJS += asm_enc_offsets.asm
+ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+ endif
- ifeq ($(CONFIG_VP8_DECODER), yes)
- asm_dec_offsets.asm: obj_int_extract
- asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+ ifeq ($(CONFIG_VP8_DECODER), yes)
+ asm_dec_offsets.asm: obj_int_extract
+ asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
- OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
- CLEAN-OBJS += asm_dec_offsets.asm
- $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+ OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+ CLEAN-OBJS += asm_dec_offsets.asm
+ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+ endif
endif
endif
diff --git a/solution.mk b/solution.mk
index bef0088..782150f 100644
--- a/solution.mk
+++ b/solution.mk
@@ -13,8 +13,9 @@
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
$(if $(filter %vpx.vcproj,$^),\
- $(foreach vcp,$(filter-out %vpx.vcproj,$^),\
+ $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\
--dep=$(vcp:.vcproj=):vpx)) \
+ --dep=vpx:obj_int_extract \
--ver=$(CONFIG_VS_VERSION)\
--out=$@ $^
vpx.sln.mk: vpx.sln
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index 69e1bdf..bd5c075 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -19,14 +19,6 @@
#include "vp8/common/idct.h"
#include "vp8/common/onyxc_int.h"
-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-
-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
-
void vp8_arch_arm_common_init(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -106,31 +98,12 @@
rtcd->recon.recon2 = vp8_recon2b_neon;
rtcd->recon.recon4 = vp8_recon4b_neon;
rtcd->recon.recon_mb = vp8_recon_mb_neon;
-
+ rtcd->recon.build_intra_predictors_mby =
+ vp8_build_intra_predictors_mby_neon;
+ rtcd->recon.build_intra_predictors_mby_s =
+ vp8_build_intra_predictors_mby_s_neon;
}
#endif
#endif
-
-#if HAVE_ARMV6
-#if CONFIG_RUNTIME_CPU_DETECT
- if (has_media)
-#endif
- {
- vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
- vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
- }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
- if (has_neon)
-#endif
- {
- vp8_build_intra_predictors_mby_ptr =
- vp8_build_intra_predictors_mby_neon;
- vp8_build_intra_predictors_mby_s_ptr =
- vp8_build_intra_predictors_mby_s_neon;
- }
-#endif
}
diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
index b46b7fc..377cb2a 100644
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -53,6 +53,9 @@
extern prototype_recon_macroblock(vp8_recon_mb_neon);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
+
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp8_recon_b_neon
@@ -74,6 +77,13 @@
#undef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp8_recon_mb_neon
+
+#undef vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
+
+#undef vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
+
#endif
#endif
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index c843d86..5c64647 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -20,12 +20,6 @@
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-
-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
#if CONFIG_RUNTIME_CPU_DETECT
@@ -45,6 +39,10 @@
rtcd->recon.recon4 = vp8_recon4b_c;
rtcd->recon.recon_mb = vp8_recon_mb_c;
rtcd->recon.recon_mby = vp8_recon_mby_c;
+ rtcd->recon.build_intra_predictors_mby =
+ vp8_build_intra_predictors_mby;
+ rtcd->recon.build_intra_predictors_mby_s =
+ vp8_build_intra_predictors_mby_s;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
@@ -75,9 +73,6 @@
#endif
#endif
- /* Pure C: */
- vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
- vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_common_init(ctx);
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e7df90a..e608f21 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -23,6 +23,9 @@
#define prototype_recon_macroblock(sym) \
void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
+#define prototype_build_intra_predictors(sym) \
+ void sym(MACROBLOCKD *x)
+
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
@@ -73,9 +76,23 @@
#endif
extern prototype_recon_macroblock(vp8_recon_recon_mby);
+#ifndef vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mby);
+
+#ifndef vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s
+#endif
+extern prototype_build_intra_predictors\
+ (vp8_recon_build_intra_predictors_mby_s);
+
+
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
+typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
@@ -86,6 +103,8 @@
vp8_recon_fn_t recon4;
vp8_recon_mb_fn_t recon_mb;
vp8_recon_mb_fn_t recon_mby;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
+ vp8_build_intra_pred_fn_t build_intra_predictors_mby;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
index 988b43a..4025a53 100644
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h
@@ -14,13 +14,6 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
-
extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index c454bbc..3d4d9b9 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -115,8 +115,8 @@
{
vp8_build_intra_predictors_mbuv_s(xd);
- vp8_build_intra_predictors_mby_s_ptr(xd);
-
+ RECON_INVOKE(&pbi->common.rtcd.recon,
+ build_intra_predictors_mby_s)(xd);
}
else
{
@@ -214,7 +214,8 @@
if (xd->mode_info_context->mbmi.mode != B_PRED)
{
- vp8_build_intra_predictors_mby_ptr(xd);
+ RECON_INVOKE(&pbi->common.rtcd.recon,
+ build_intra_predictors_mby)(xd);
} else {
vp8_intra_prediction_down_copy(xd);
}
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index 73007d4..5ba14f3 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -50,8 +50,8 @@
cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6;
cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_armv6;
- /*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
- cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
+ cpi->rtcd.variance.mse16x16 = vp8_mse16x16_armv6;
+ /*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
/*cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
cpi->rtcd.variance.get8x8var = vp8_get8x8var_c;
@@ -71,8 +71,8 @@
cpi->rtcd.encodemb.submby = vp8_subtract_mby_c;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;*/
- /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
- cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;*/
+ /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
+ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
}
#endif
diff --git a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
new file mode 100644
index 0000000..ae2f603
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -0,0 +1,224 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *b
+; r1 BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+ stmfd sp!, {r1, r4-r11, lr}
+
+ ldr r3, [r0, #vp8_block_coeff] ; coeff
+ ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
+ ldr r5, [r0, #vp8_block_round] ; round
+ ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
+ ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
+ ldr r8, [r1, #vp8_blockd_dequant] ; dequant
+
+ ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
+ ; is used to update the counter so that
+ ; it can be used to mark nonzero
+ ; quantized coefficient pairs.
+
+ mov r1, #0 ; flags for quantized coeffs
+
+ ; PART 1: quantization and dequantization loop
+loop
+ ldr r9, [r3], #4 ; [z1 | z0]
+ ldr r10, [r5], #4 ; [r1 | r0]
+ ldr r11, [r4], #4 ; [q1 | q0]
+
+ ssat16 lr, #1, r9 ; [sz1 | sz0]
+ eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
+ ssub16 r9, r9, lr ; x = (z ^ sz) - sz
+ sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
+
+ ldr r12, [r3], #4 ; [z3 | z2]
+
+ smulbb r0, r9, r11 ; [(x0+r0)*q0]
+ smultt r9, r9, r11 ; [(x1+r1)*q1]
+
+ ldr r10, [r5], #4 ; [r3 | r2]
+
+ ssat16 r11, #1, r12 ; [sz3 | sz2]
+ eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
+ pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
+ ldr r9, [r4], #4 ; [q3 | q2]
+ ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
+
+ sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
+
+ eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+ smulbb r10, r12, r9 ; [(x2+r2)*q2]
+ smultt r12, r12, r9 ; [(x3+r3)*q3]
+
+ ssub16 r0, r0, lr ; x = (y ^ sz) - sz
+
+ cmp r0, #0 ; check if zero
+ orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
+
+ str r0, [r6], #4 ; *qcoeff++ = x
+ ldr r9, [r8], #4 ; [dq1 | dq0]
+
+ pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
+ eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
+ ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
+
+ cmp r10, #0 ; check if zero
+ orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
+
+ str r10, [r6], #4 ; *qcoeff++ = x
+ ldr r11, [r8], #4 ; [dq3 | dq2]
+
+ smulbb r12, r0, r9 ; [x0*dq0]
+ smultt r0, r0, r9 ; [x1*dq1]
+
+ smulbb r9, r10, r11 ; [x2*dq2]
+ smultt r10, r10, r11 ; [x3*dq3]
+
+ lsls r2, r2, #2 ; shift loop counter; it becomes 0 (Z set) after the 4th pass
+ strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
+ strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
+ strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
+ strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
+ add r7, r7, #8 ; dqcoeff += 4 (four halfwords = 8 bytes)
+ bne loop ; flags are still those of lsls (strh/add leave them untouched)
+
+ ; PART 2: check position for eob...
+ mov lr, #0 ; init eob
+ cmp r1, #0 ; coeffs after quantization?
+ ldr r11, [sp, #0] ; restore BLOCKD pointer
+ beq end ; skip eob calculations if all zero
+
+ ldr r0, [r11, #vp8_blockd_qcoeff]
+
+ ; check shortcut for nonzero qcoeffs
+ tst r1, #0x80
+ bne quant_coeff_15_14
+ tst r1, #0x20
+ bne quant_coeff_13_11
+ tst r1, #0x8
+ bne quant_coeff_12_7
+ tst r1, #0x40
+ bne quant_coeff_10_9
+ tst r1, #0x10
+ bne quant_coeff_8_3
+ tst r1, #0x2
+ bne quant_coeff_6_5
+ tst r1, #0x4
+ bne quant_coeff_4_2
+ b quant_coeff_1_0
+
+quant_coeff_15_14
+ ldrh r2, [r0, #30] ; rc=15, i=15
+ mov lr, #16
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #28] ; rc=14, i=14
+ mov lr, #15
+ cmp r3, #0
+ bne end
+
+quant_coeff_13_11
+ ldrh r2, [r0, #22] ; rc=11, i=13
+ mov lr, #14
+ cmp r2, #0
+ bne end
+
+quant_coeff_12_7
+ ldrh r3, [r0, #14] ; rc=7, i=12
+ mov lr, #13
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #20] ; rc=10, i=11
+ mov lr, #12
+ cmp r2, #0
+ bne end
+
+quant_coeff_10_9
+ ldrh r3, [r0, #26] ; rc=13, i=10
+ mov lr, #11
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #24] ; rc=12, i=9
+ mov lr, #10
+ cmp r2, #0
+ bne end
+
+quant_coeff_8_3
+ ldrh r3, [r0, #18] ; rc=9, i=8
+ mov lr, #9
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #12] ; rc=6, i=7
+ mov lr, #8
+ cmp r2, #0
+ bne end
+
+quant_coeff_6_5
+ ldrh r3, [r0, #6] ; rc=3, i=6
+ mov lr, #7
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #4] ; rc=2, i=5
+ mov lr, #6
+ cmp r2, #0
+ bne end
+
+quant_coeff_4_2
+ ldrh r3, [r0, #10] ; rc=5, i=4
+ mov lr, #5
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #16] ; rc=8, i=3
+ mov lr, #4
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #8] ; rc=4, i=2
+ mov lr, #3
+ cmp r3, #0
+ bne end
+
+quant_coeff_1_0
+ ldrh r2, [r0, #2] ; rc=1, i=1
+ mov lr, #2
+ cmp r2, #0
+ bne end
+
+ mov lr, #1 ; rc=0, i=0
+
+end
+ str lr, [r11, #vp8_blockd_eob]
+ ldmfd sp!, {r1, r4-r11, pc}
+
+ ENDP
+
+loop_count
+ DCD 0x1000000
+
+ END
+
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
new file mode 100644
index 0000000..a9060d7
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -0,0 +1,133 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
+; So, we can remove this part of calculation.
+
+|vp8_mse16x16_armv6| PROC
+
+ push {r4-r9, lr}
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+ mov r4, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; partial sums (results in r5/r6 are never read)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; partial sums (results in r5/r6 are never read)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; partial sums (results in r5/r6 are never read)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set ref_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; partial sums (results in r5/r6 are never read)
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse (5th argument, above the 7 pushed registers)
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index 5f9155e..0c6adf4 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -12,6 +12,16 @@
#ifndef QUANTIZE_ARM_H
#define QUANTIZE_ARM_H
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#undef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+
+#endif /* HAVE_ARMV6 */
+
+
#if HAVE_ARMV7
extern prototype_quantize_block(vp8_fast_quantize_b_neon);
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 7ac0ac0..7ad7c76 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -20,6 +20,7 @@
extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp8_mse16x16_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -32,6 +33,9 @@
#undef vp8_variance_var16x16
#define vp8_variance_var16x16 vp8_variance16x16_armv6
+#undef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_armv6
+
#undef vp8_variance_halfpixvar16x16_h
#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index cd49532..fcf7775 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -65,6 +65,17 @@
DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
+// offsets from BLOCK structure
+DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
+DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_round, offsetof(BLOCK, round));
+
+// offsets from BLOCKD structure
+DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
+
// These two sizes are used in vp8cx_pack_tokens. They are hard coded
// so if the size changes this will have to be adjusted.
#if HAVE_ARMV5TE
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 0ced6e7..0613b90 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1184,7 +1184,8 @@
int distortion2;
x->e_mbd.mode_info_context->mbmi.mode = mode;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode];
this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index cd66016..7b81c8d 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -80,7 +80,7 @@
{
int b;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 12d5f66..5c607a0 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -24,6 +24,35 @@
extern void vp8_build_block_offsets(MACROBLOCK *x);
extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+#if CONFIG_MULTITHREAD
+
+extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
+/* Loop filter worker: each time h_event_start_lpf is posted it runs
+ * loopfilter_frame() and then posts h_event_end_lpf. A failed wait is
+ * retried; the thread exits once b_multi_threaded is found cleared. */
+static THREAD_FUNCTION loopfilter_thread(void *p_data)
+{
+    LPFTHREAD_DATA *thread_data = (LPFTHREAD_DATA *)p_data;
+    VP8_COMP *cpi = (VP8_COMP *)(thread_data->ptr1);
+
+    while (cpi->b_multi_threaded != 0)
+    {
+        if (sem_wait(&cpi->h_event_start_lpf) != 0)
+            continue;
+
+        if (cpi->b_multi_threaded == FALSE) // we're shutting down
+            break;
+
+        loopfilter_frame(cpi, &cpi->common);
+
+        sem_post(&cpi->h_event_end_lpf);
+    }
+
+    return 0;
+}
+#endif
+
static
THREAD_FUNCTION thread_encoding_proc(void *p_data)
{
@@ -479,6 +508,15 @@
pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
}
+ {
+ LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+ sem_init(&cpi->h_event_start_lpf, 0, 0);
+ sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+ lpfthd->ptr1 = (void *)cpi;
+ pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
+ }
}
}
@@ -500,9 +538,14 @@
sem_destroy(&cpi->h_event_start_encoding[i]);
}
+
+ sem_post(&cpi->h_event_start_lpf);
+ pthread_join(cpi->h_filter_thread, 0);
}
sem_destroy(&cpi->h_event_end_encoding);
+ sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_start_lpf);
//free thread related resources
vpx_free(cpi->h_event_start_encoding);
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index d2cc848..774d9b6 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -281,21 +281,6 @@
}
-extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
-{
- /* Calculate the size of a stats packet, which is dependent on the frame
- * resolution. The FIRSTPASS_STATS struct has a single element array,
- * motion_map, which is virtually expanded to have one element per
- * macroblock.
- */
- size_t stats_sz;
-
- stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
- stats_sz = (stats_sz + 7) & ~7;
- return stats_sz;
-}
-
-
void vp8_output_stats(const VP8_COMP *cpi,
struct vpx_codec_pkt_list *pktlist,
FIRSTPASS_STATS *stats)
@@ -303,18 +288,19 @@
struct vpx_codec_cx_pkt pkt;
pkt.kind = VPX_CODEC_STATS_PKT;
pkt.data.twopass_stats.buf = stats;
- pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
vpx_codec_pkt_list_add(pktlist, &pkt);
// TEMP debug code
#if OUTPUT_FPF
+
{
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
- " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f"
- " %12.4f\n",
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+ " %12.0f %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
@@ -333,24 +319,17 @@
stats->count,
stats->duration);
fclose(fpfile);
-
-
- fpfile = fopen("fpmotionmap.stt", "a");
- if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile));
- fclose(fpfile);
}
#endif
}
int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
{
- size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
-
if (cpi->stats_in >= cpi->stats_in_end)
return EOF;
*fps = *cpi->stats_in;
- cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz);
+ cpi->stats_in = (void*)((char *)cpi->stats_in + sizeof(FIRSTPASS_STATS));
return 1;
}
@@ -416,57 +395,9 @@
section->duration /= section->count;
}
-unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
-{
- return cpi->fp_motion_map_stats;
-}
-void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
-{
- cpi->fp_motion_map_stats = target_pos;
-}
-
-void vp8_advance_fpmm(VP8_COMP *cpi, int count)
-{
- cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats +
- count * vp8_firstpass_stats_sz(cpi->common.MBs));
-}
-
-void vp8_input_fpmm(VP8_COMP *cpi)
-{
- unsigned char *fpmm = cpi->fp_motion_map;
- int MBs = cpi->common.MBs;
- int max_frames = cpi->active_arnr_frames;
- int i;
-
- for (i=0; i<max_frames; i++)
- {
- char *motion_map = (char*)cpi->fp_motion_map_stats
- + sizeof(FIRSTPASS_STATS);
-
- memcpy(fpmm, motion_map, MBs);
- fpmm += MBs;
- vp8_advance_fpmm(cpi, 1);
- }
-
- // Flag the use of weights in the temporal filter
- cpi->use_weighted_temporal_filter = 1;
-}
-
void vp8_init_first_pass(VP8_COMP *cpi)
{
vp8_zero_stats(cpi->total_stats);
-
-// TEMP debug code
-#ifdef OUTPUT_FPF
- {
- FILE *fpfile;
- fpfile = fopen("firstpass.stt", "w");
- fclose(fpfile);
- fpfile = fopen("fpmotionmap.stt", "wb");
- fclose(fpfile);
- }
-#endif
-
}
void vp8_end_first_pass(VP8_COMP *cpi)
@@ -583,8 +514,6 @@
MV zero_ref_mv = {0, 0};
- unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
-
vp8_clear_system_state(); //__asm emms;
x->src = * cpi->Source;
@@ -636,7 +565,6 @@
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int this_error;
- int zero_error;
int zz_to_best_ratio;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -658,9 +586,6 @@
// Cumulative intra error total
intra_error += (long long)this_error;
- // Indicate default assumption of intra in the motion map
- *fp_motion_map_ptr = 0;
-
// Set up limit values for motion vectors to prevent them extending outside the UMV borders
x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
@@ -679,9 +604,6 @@
d->bmi.mv.as_mv.row = 0;
d->bmi.mv.as_mv.col = 0;
- // Save (0,0) error for later use
- zero_error = motion_error;
-
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search
vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
@@ -796,25 +718,6 @@
else if (d->bmi.mv.as_mv.col < 0)
sum_in_vectors--;
}
-
- // Compute how close (0,0) predictor is to best
- // predictor in terms of their prediction error
- zz_to_best_ratio = (10*zero_error + this_error/2)
- / (this_error+!this_error);
-
- if ((zero_error < 50000) &&
- (zz_to_best_ratio <= 11) )
- *fp_motion_map_ptr = 1;
- else
- *fp_motion_map_ptr = 0;
- }
- else
- {
- // 0,0 mv was best
- if( zero_error<50000 )
- *fp_motion_map_ptr = 2;
- else
- *fp_motion_map_ptr = 1;
}
}
}
@@ -828,9 +731,6 @@
recon_yoffset += 16;
recon_uvoffset += 8;
-
- // Update the motion map
- fp_motion_map_ptr++;
}
// adjust to the next row of mbs
@@ -892,13 +792,10 @@
// than the full time between subsequent cpi->source_time_stamp s .
fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
- // don't want to do outputstats with a stack variable!
+ // don't want to do output stats with a stack variable!
memcpy(cpi->this_frame_stats,
&fps,
sizeof(FIRSTPASS_STATS));
- memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS),
- cpi->fp_motion_map,
- sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs);
vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
vp8_accumulate_stats(cpi->total_stats, &fps);
}
@@ -944,10 +841,10 @@
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
#define BASE_ERRPERMB 150
-static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
- int num_mbs = ((Height * Width) / (16 * 16));
+ int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1044,10 +941,10 @@
return Q;
}
-static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
- int num_mbs = ((Height * Width) / (16 * 16));
+ int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1095,10 +992,10 @@
}
// Estimate a worst case Q for a KF group
-static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
{
int Q;
- int num_mbs = ((Height * Width) / (16 * 16));
+ int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
int bits_per_mb_at_this_q;
@@ -1193,11 +1090,10 @@
// For cq mode estimate a cq level that matches the observed
// complexity and data rate.
-static int estimate_cq(VP8_COMP *cpi, double section_err,
- int section_target_bandwitdh, int Height, int Width)
+static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
{
int Q;
- int num_mbs = ((Height * Width) / (16 * 16));
+ int num_mbs = cpi->common.MBs;
int target_norm_bits_per_mb;
double err_per_mb = section_err / num_mbs;
@@ -1351,8 +1247,6 @@
cpi->clip_bpe = cpi->bits_left /
DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
cpi->observed_bpe = cpi->clip_bpe;
-
- cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
}
void vp8_end_second_pass(VP8_COMP *cpi)
@@ -1360,7 +1254,7 @@
}
// This function gives and estimate of how badly we believe
-// the predicition quality is decaying from frame to frame.
+// the prediction quality is decaying from frame to frame.
double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
@@ -1472,8 +1366,6 @@
int max_bits = frame_max_bits(cpi); // Max for a single frame
- unsigned char *fpmm_pos;
-
unsigned int allow_alt_ref =
cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
@@ -1482,8 +1374,6 @@
vp8_clear_system_state(); //__asm emms;
- fpmm_pos = vp8_fpmm_get_pos(cpi);
-
start_pos = cpi->stats_in;
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -1717,7 +1607,7 @@
arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
// Estimate if there are enough bits available to make worthwhile use of an arf.
- tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
+ tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
// Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
if (tmp_q < cpi->worst_quality)
@@ -1780,20 +1670,6 @@
}
cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-
- {
- // Advance to & read in the motion map for those frames
- // to be considered for filtering based on the position
- // of the ARF
- vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save);
-
- // Position at the 'earliest' frame to be filtered
- vp8_advance_fpmm(cpi,
- cpi->baseline_gf_interval - frames_bwd);
-
- // Read / create a motion map for the region of interest
- vp8_input_fpmm(cpi);
- }
}
else
{
@@ -2023,9 +1899,6 @@
reset_fpf_position(cpi, start_pos);
}
-
- // Reset the First pass motion map file position
- vp8_fpmm_reset_pos(cpi, fpmm_pos);
}
// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
@@ -2107,13 +1980,6 @@
if (EOF == vp8_input_stats(cpi, &this_frame))
return;
- vpx_memset(cpi->fp_motion_map, 0,
- cpi->oxcf.arnr_max_frames*cpi->common.MBs);
- cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi);
-
- // Step over this frame's first pass motion map
- vp8_advance_fpmm(cpi, 1);
-
this_frame_error = this_frame.ssim_weighted_pred_err;
this_frame_intra_error = this_frame.intra_error;
this_frame_coded_error = this_frame.coded_error;
@@ -2245,8 +2111,7 @@
est_cq =
estimate_cq( cpi,
(cpi->total_coded_error_left / frames_left),
- (int)(cpi->bits_left / frames_left),
- cpi->common.Height, cpi->common.Width);
+ (int)(cpi->bits_left / frames_left));
cpi->cq_target_quality = cpi->oxcf.cq_level;
if ( est_cq > cpi->cq_target_quality )
@@ -2258,9 +2123,7 @@
cpi->maxq_min_limit = cpi->best_quality;
tmp_q = estimate_max_q( cpi,
(cpi->total_coded_error_left / frames_left),
- (int)(cpi->bits_left / frames_left),
- cpi->common.Height,
- cpi->common.Width);
+ (int)(cpi->bits_left / frames_left));
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
@@ -2288,7 +2151,7 @@
if (frames_left < 1)
frames_left = 1;
- tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+ tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left));
// Move active_worst_quality but in a damped way
if (tmp_q > cpi->active_worst_quality)
@@ -2897,7 +2760,7 @@
bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
// Work out if spatial resampling is necessary
- kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
+ kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);
// If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
projected_bits_perframe = bits_per_frame;
@@ -2968,7 +2831,7 @@
effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
// Now try again and see what Q we get with the smaller image size
- kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
+ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);
if (0)
{
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index fc0580d..81108fe 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -103,6 +103,10 @@
// Pure C:
vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
+#if CONFIG_PSNR
+ cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c;
+ cpi->rtcd.variance.ssimpf = ssim_parms_c;
+#endif
#if ARCH_X86 || ARCH_X86_64
vp8_arch_x86_encoder_init(cpi);
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 33aaa2c..c210c1d 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -1415,7 +1415,7 @@
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- unsigned short sad_array8[8];
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
unsigned int sad_array[3];
// Work out the mid point for the search
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 39610a7..8965634 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -86,9 +86,11 @@
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
int lumamask,
- double *weight
+ double *weight,
+ const vp8_variance_rtcd_vtable_t *rtcd
);
+
extern double vp8_calc_ssimg
(
YV12_BUFFER_CONFIG *source,
@@ -281,12 +283,6 @@
vpx_free(cpi->active_map);
cpi->active_map = 0;
-#if !(CONFIG_REALTIME_ONLY)
- // Delete first pass motion map
- vpx_free(cpi->fp_motion_map);
- cpi->fp_motion_map = 0;
-#endif
-
vp8_de_alloc_frame_buffers(&cpi->common);
vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
@@ -1360,11 +1356,11 @@
#if !(CONFIG_REALTIME_ONLY)
vpx_free(cpi->total_stats);
- cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
vpx_free(cpi->this_frame_stats);
- cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+ cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
if(!cpi->total_stats || !cpi->this_frame_stats)
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1462,8 +1458,7 @@
VP8_COMP *cpi = (VP8_COMP *)(ptr);
VP8_COMMON *cm = &cpi->common;
- if (!cpi)
- return;
+ cpi->oxcf = *oxcf;
cpi->auto_gold = 1;
cpi->auto_adjust_gold_quantizer = 1;
@@ -1475,299 +1470,31 @@
cm->version = oxcf->Version;
vp8_setup_version(cm);
- if (oxcf == 0)
- {
- cpi->pass = 0;
+ // change includes all joint functionality
+ vp8_change_config(ptr, oxcf);
- cpi->auto_worst_q = 0;
- cpi->oxcf.best_allowed_q = MINQ;
- cpi->oxcf.worst_allowed_q = MAXQ;
- cpi->oxcf.cq_level = MINQ;
+ // Initialize active best and worst q and average q values.
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
- cpi->oxcf.end_usage = USAGE_STREAM_FROM_SERVER;
- cpi->oxcf.starting_buffer_level = 4000;
- cpi->oxcf.optimal_buffer_level = 5000;
- cpi->oxcf.maximum_buffer_size = 6000;
- cpi->oxcf.under_shoot_pct = 90;
- cpi->oxcf.allow_df = 0;
- cpi->oxcf.drop_frames_water_mark = 20;
-
- cpi->oxcf.allow_spatial_resampling = 0;
- cpi->oxcf.resample_down_water_mark = 40;
- cpi->oxcf.resample_up_water_mark = 60;
-
- cpi->oxcf.fixed_q = cpi->interquantizer;
-
- cpi->filter_type = NORMAL_LOOPFILTER;
-
- if (cm->simpler_lpf)
- cpi->filter_type = SIMPLE_LOOPFILTER;
-
- cpi->compressor_speed = 1;
- cpi->horiz_scale = 0;
- cpi->vert_scale = 0;
- cpi->oxcf.two_pass_vbrbias = 50;
- cpi->oxcf.two_pass_vbrmax_section = 400;
- cpi->oxcf.two_pass_vbrmin_section = 0;
-
- cpi->oxcf.Sharpness = 0;
- cpi->oxcf.noise_sensitivity = 0;
- }
- else
- cpi->oxcf = *oxcf;
-
-
- switch (cpi->oxcf.Mode)
- {
-
- case MODE_REALTIME:
- cpi->pass = 0;
- cpi->compressor_speed = 2;
-
- if (cpi->oxcf.cpu_used < -16)
- {
- cpi->oxcf.cpu_used = -16;
- }
-
- if (cpi->oxcf.cpu_used > 16)
- cpi->oxcf.cpu_used = 16;
-
- break;
-
-#if !(CONFIG_REALTIME_ONLY)
- case MODE_GOODQUALITY:
- cpi->pass = 0;
- cpi->compressor_speed = 1;
-
- if (cpi->oxcf.cpu_used < -5)
- {
- cpi->oxcf.cpu_used = -5;
- }
-
- if (cpi->oxcf.cpu_used > 5)
- cpi->oxcf.cpu_used = 5;
-
- break;
-
- case MODE_BESTQUALITY:
- cpi->pass = 0;
- cpi->compressor_speed = 0;
- break;
-
- case MODE_FIRSTPASS:
- cpi->pass = 1;
- cpi->compressor_speed = 1;
- break;
- case MODE_SECONDPASS:
- cpi->pass = 2;
- cpi->compressor_speed = 1;
-
- if (cpi->oxcf.cpu_used < -5)
- {
- cpi->oxcf.cpu_used = -5;
- }
-
- if (cpi->oxcf.cpu_used > 5)
- cpi->oxcf.cpu_used = 5;
-
- break;
- case MODE_SECONDPASS_BEST:
- cpi->pass = 2;
- cpi->compressor_speed = 0;
- break;
-#endif
- }
-
- if (cpi->pass == 0)
- cpi->auto_worst_q = 1;
-
- cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
- cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
- cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
- if (oxcf->fixed_q >= 0)
- {
- if (oxcf->worst_allowed_q < 0)
- cpi->oxcf.fixed_q = q_trans[0];
- else
- cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
-
- if (oxcf->alt_q < 0)
- cpi->oxcf.alt_q = q_trans[0];
- else
- cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
-
- if (oxcf->key_q < 0)
- cpi->oxcf.key_q = q_trans[0];
- else
- cpi->oxcf.key_q = q_trans[oxcf->key_q];
-
- if (oxcf->gold_q < 0)
- cpi->oxcf.gold_q = q_trans[0];
- else
- cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
-
- }
-
- cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
- cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
-
- //cpi->use_golden_frame_only = 0;
- //cpi->use_last_frame_only = 0;
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- cm->refresh_entropy_probs = 1;
-
- if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
- cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
-
- setup_features(cpi);
-
- {
- int i;
-
- for (i = 0; i < MAX_MB_SEGMENTS; i++)
- cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
- }
-
- // At the moment the first order values may not be > MAXQ
- if (cpi->oxcf.fixed_q > MAXQ)
- cpi->oxcf.fixed_q = MAXQ;
-
- // local file playback mode == really big buffer
- if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
- {
- cpi->oxcf.starting_buffer_level = 60000;
- cpi->oxcf.optimal_buffer_level = 60000;
- cpi->oxcf.maximum_buffer_size = 240000;
-
- }
-
-
- // Convert target bandwidth from Kbit/s to Bit/s
- cpi->oxcf.target_bandwidth *= 1000;
+ // Initialise the starting buffer levels
cpi->oxcf.starting_buffer_level =
rescale(cpi->oxcf.starting_buffer_level,
cpi->oxcf.target_bandwidth, 1000);
- if (cpi->oxcf.optimal_buffer_level == 0)
- cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
- else
- cpi->oxcf.optimal_buffer_level =
- rescale(cpi->oxcf.optimal_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
-
- if (cpi->oxcf.maximum_buffer_size == 0)
- cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
- else
- cpi->oxcf.maximum_buffer_size =
- rescale(cpi->oxcf.maximum_buffer_size,
- cpi->oxcf.target_bandwidth, 1000);
-
- cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
- vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
- cpi->worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
- cpi->best_quality = cpi->oxcf.best_allowed_q;
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
- cpi->cq_target_quality = cpi->oxcf.cq_level;
-
- cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
cpi->total_actual_bits = 0;
- cpi->total_target_vs_actual = 0;
-
- // Only allow dropped frames in buffered mode
- cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
-
- cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
-
- if (!cm->use_bilinear_mc_filter)
- cm->mcomp_filter_type = SIXTAP;
- else
- cm->mcomp_filter_type = BILINEAR;
-
- cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
- cm->Width = cpi->oxcf.Width ;
- cm->Height = cpi->oxcf.Height ;
-
- cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
-
- cm->horiz_scale = cpi->horiz_scale;
- cm->vert_scale = cpi->vert_scale ;
-
- // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
- if (cpi->oxcf.Sharpness > 7)
- cpi->oxcf.Sharpness = 7;
-
- cm->sharpness_level = cpi->oxcf.Sharpness;
-
- if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
- {
- int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
- int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
- Scale2Ratio(cm->horiz_scale, &hr, &hs);
- Scale2Ratio(cm->vert_scale, &vr, &vs);
-
- // always go to the next whole number
- cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
- cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
- }
-
- if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
- ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
- cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
- {
- alloc_raw_frame_buffers(cpi);
- vp8_alloc_compressor_data(cpi);
- }
-
- // Clamp KF frame size to quarter of data rate
- if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
- cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
- if (cpi->oxcf.fixed_q >= 0)
- {
- cpi->last_q[0] = cpi->oxcf.fixed_q;
- cpi->last_q[1] = cpi->oxcf.fixed_q;
- }
-
- cpi->Speed = cpi->oxcf.cpu_used;
-
- // force to allowlag to 0 if lag_in_frames is 0;
- if (cpi->oxcf.lag_in_frames == 0)
- {
- cpi->oxcf.allow_lag = 0;
- }
- // Limit on lag buffers as these are not currently dynamically allocated
- else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
- cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
- // YX Temp
- cpi->last_alt_ref_sei = -1;
- cpi->is_src_frame_alt_ref = 0;
- cpi->is_next_src_alt_ref = 0;
-
-#if 0
- // Experimental RD Code
- cpi->frame_distortion = 0;
- cpi->last_frame_distortion = 0;
-#endif
+ cpi->total_target_vs_actual = 0;
#if VP8_TEMPORAL_ALT_REF
-
- cpi->use_weighted_temporal_filter = 0;
-
{
int i;
@@ -1779,12 +1506,6 @@
#endif
}
-/*
- * This function needs more clean up, i.e. be more tuned torwards
- * change_config rather than init_config !!!!!!!!!!!!!!!!
- * YX - 5/28/2009
- *
- */
void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
{
@@ -1897,7 +1618,8 @@
}
- cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+ cpi->baseline_gf_interval =
+ cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
@@ -1908,7 +1630,8 @@
cm->refresh_entropy_probs = 1;
if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
- cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+ cm->multi_token_partition =
+ (TOKEN_PARTITION) cpi->oxcf.token_partitions;
setup_features(cpi);
@@ -1929,16 +1652,12 @@
cpi->oxcf.starting_buffer_level = 60000;
cpi->oxcf.optimal_buffer_level = 60000;
cpi->oxcf.maximum_buffer_size = 240000;
-
}
// Convert target bandwidth from Kbit/s to Bit/s
cpi->oxcf.target_bandwidth *= 1000;
- cpi->oxcf.starting_buffer_level =
- rescale(cpi->oxcf.starting_buffer_level,
- cpi->oxcf.target_bandwidth, 1000);
-
+ // Set or reset optimal and maximum buffer levels.
if (cpi->oxcf.optimal_buffer_level == 0)
cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
else
@@ -1953,31 +1672,41 @@
rescale(cpi->oxcf.maximum_buffer_size,
cpi->oxcf.target_bandwidth, 1000);
- cpi->buffer_level = cpi->oxcf.starting_buffer_level;
- cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
-
+ // Set up frame rate and related parameters rate control values.
vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+ // Set absolute upper and lower quality limits
cpi->worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
cpi->best_quality = cpi->oxcf.best_allowed_q;
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+
+ // active values should only be modified if out of new range
+ if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+ }
+ if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ }
+ // less likely
+ else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+ }
+
cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
cpi->cq_target_quality = cpi->oxcf.cq_level;
- cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
-
- cpi->total_actual_bits = 0;
- cpi->total_target_vs_actual = 0;
-
// Only allow dropped frames in buffered mode
- cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
- cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
+ cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
if (!cm->use_bilinear_mc_filter)
cm->mcomp_filter_type = SIXTAP;
@@ -1992,7 +1721,8 @@
cm->horiz_scale = cpi->horiz_scale;
cm->vert_scale = cpi->vert_scale ;
- cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+ // As per VP8
+ cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
@@ -2013,8 +1743,10 @@
cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
}
- if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
- ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
+ if (((cm->Width + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_width ||
+ ((cm->Height + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_height ||
cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
{
alloc_raw_frame_buffers(cpi);
@@ -2153,12 +1885,6 @@
vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
cpi->active_map_enabled = 0;
-#if !(CONFIG_REALTIME_ONLY)
- // Create the first pass motion map structure and set to 0
- // Allocate space for maximum of 15 buffers
- CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));
-#endif
-
#if 0
// Experimental code for lagged and one pass
// Initialise one_pass GF frames stats
@@ -2308,7 +2034,7 @@
}
else if (cpi->pass == 2)
{
- size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
int packets = oxcf->two_pass_stats_in.sz / packet_sz;
cpi->stats_in = oxcf->two_pass_stats_in.buf;
@@ -3509,6 +3235,89 @@
return force_recode;
}
+/* Pick the loop filter level (unless filtering is disabled), filter and
+ * border-extend the frame to show, then perform any golden / alt-ref buffer
+ * copies signalled for this frame. */
+void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+    if (cm->no_lpf)
+    {
+        cm->filter_level = 0;
+    }
+    else
+    {
+        struct vpx_usec_timer timer;
+
+        vp8_clear_system_state();
+
+        /* Time the level search; the total accumulates in time_pick_lpf. */
+        vpx_usec_timer_start(&timer);
+        if (cpi->sf.auto_filter == 0)
+            vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+        else
+            vp8cx_pick_filter_level(cpi->Source, cpi);
+
+        vpx_usec_timer_mark(&timer);
+        cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+    }
+
+#if CONFIG_MULTITHREAD
+    /* The semaphore is created together with the encoder threads, so only
+     * post it when those threads are in use. */
+    if (cpi->b_multi_threaded)
+        sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+    if (cm->filter_level > 0)
+    {
+        vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+    {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
+        // We copy new_frame rather than the last frame when
+        // cm->refresh_last_frame is set because last and new buffers will
+        // already have been swapped in that case.
+        YV12_BUFFER_CONFIG *cur_yv12 =
+            cm->refresh_last_frame ? new_yv12 : lst_yv12;
+
+        // At this point the new frame has been encoded.
+        // If any buffer copy / swapping is signaled it should be done here.
+        if (cm->frame_type == KEY_FRAME)
+        {
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
+        }
+        else // For non key frames
+        {
+            // Code to copy between reference buffers
+            if (cm->copy_buffer_to_arf)
+            {
+                if (cm->copy_buffer_to_arf == 1)
+                    vp8_yv12_copy_frame_ptr(cur_yv12, alt_yv12);
+                else if (cm->copy_buffer_to_arf == 2)
+                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
+            }
+
+            if (cm->copy_buffer_to_gf)
+            {
+                if (cm->copy_buffer_to_gf == 1)
+                    vp8_yv12_copy_frame_ptr(cur_yv12, gld_yv12);
+                else if (cm->copy_buffer_to_gf == 2)
+                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
+            }
+        }
+    }
+}
+
static void encode_frame_to_data_rate
(
VP8_COMP *cpi,
@@ -3542,6 +3351,7 @@
int drop_mark50 = drop_mark / 4;
int drop_mark25 = drop_mark / 8;
+
// Clear down mmx registers to allow floating point in what follows
vp8_clear_system_state();
@@ -3862,11 +3672,12 @@
}
}
- // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
- // to prevent bits just going to waste.
+ // If CBR and the buffer is as full then it is reasonable to allow
+ // higher quality on the frames to prevent bits just going to waste.
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
{
- // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause
+ // Note that the use of >= here eliminates the risk of a divide
+ // by 0 error in the else if clause
if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
cpi->active_best_quality = cpi->best_quality;
@@ -3879,6 +3690,20 @@
}
}
}
+ // Make sure constrained quality mode limits are adhered to for the first
+ // few frames of one pass encodes
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ if ( (cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+ {
+ cpi->active_best_quality = cpi->best_quality;
+ }
+ else if (cpi->active_best_quality < cpi->cq_target_quality)
+ {
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
// Clip the active best and worst quality values to limits
if (cpi->active_worst_quality > cpi->worst_quality)
@@ -4058,8 +3883,8 @@
vp8_setup_key_frame(cpi);
// transform / motion compensation build reconstruction frame
-
vp8_encode_frame(cpi);
+
cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
@@ -4408,92 +4233,43 @@
else
cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
- if (cm->no_lpf)
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
{
- cm->filter_level = 0;
+ sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
}
else
+#endif
{
- struct vpx_usec_timer timer;
-
- vpx_usec_timer_start(&timer);
-
- if (cpi->sf.auto_filter == 0)
- vp8cx_pick_filter_level_fast(cpi->Source, cpi);
- else
- vp8cx_pick_filter_level(cpi->Source, cpi);
-
- vpx_usec_timer_mark(&timer);
-
- cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+ loopfilter_frame(cpi, cm);
}
- if (cm->filter_level > 0)
- {
- vp8cx_set_alt_lf_level(cpi, cm->filter_level);
- vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
- cm->last_filter_type = cm->filter_type;
- cm->last_sharpness_level = cm->sharpness_level;
- }
-
- /* Move storing frame_type out of the above loop since it is also
- * needed in motion search besides loopfilter */
- cm->last_frame_type = cm->frame_type;
-
- vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
if (cpi->oxcf.error_resilient_mode == 1)
{
cm->refresh_entropy_probs = 0;
}
+#if CONFIG_MULTITHREAD
+ /* wait until filter_level has been picked so that we can continue with stream packing */
+ if (cpi->b_multi_threaded)
+ sem_wait(&cpi->h_event_end_lpf);
+#endif
+
// build the bitstream
vp8_pack_bitstream(cpi, dest, size);
+#if CONFIG_MULTITHREAD
+ /* wait for loopfilter thread done */
+ if (cpi->b_multi_threaded)
{
- YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
- YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
- YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
- // At this point the new frame has been encoded coded.
- // If any buffer copy / swaping is signalled it should be done here.
- if (cm->frame_type == KEY_FRAME)
- {
- vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
- vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
- }
- else // For non key frames
- {
- // Code to copy between reference buffers
- if (cm->copy_buffer_to_arf)
- {
- if (cm->copy_buffer_to_arf == 1)
- {
- if (cm->refresh_last_frame)
- // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
- vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
- else
- vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
- }
- else if (cm->copy_buffer_to_arf == 2)
- vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
- }
-
- if (cm->copy_buffer_to_gf)
- {
- if (cm->copy_buffer_to_gf == 1)
- {
- if (cm->refresh_last_frame)
- // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
- vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
- else
- vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
- }
- else if (cm->copy_buffer_to_gf == 2)
- vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
- }
- }
+ sem_wait(&cpi->h_event_end_lpf);
}
+#endif
+
+ /* Move storing frame_type out of the above loop since it is also
+ * needed in motion search besides loopfilter */
+ cm->last_frame_type = cm->frame_type;
// Update rate control heuristics
cpi->total_byte_count += (*size);
@@ -5328,7 +5104,9 @@
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+ {
generate_psnr_packet(cpi);
+ }
#if CONFIG_PSNR
@@ -5344,12 +5122,35 @@
if (cpi->b_calculate_psnr)
{
double y, u, v;
- double sq_error;
- double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
+ double ye,ue,ve;
+ double frame_psnr;
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ int y_samples = orig->y_height * orig->y_width ;
+ int uv_samples = orig->uv_height * orig->uv_width ;
+ int t_samples = y_samples + 2 * uv_samples;
+ long long sq_error;
- cpi->total_y += y;
- cpi->total_u += u;
- cpi->total_v += v;
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ sq_error = ye + ue + ve;
+
+ frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);
+ cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+ cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
cpi->total_sq_error += sq_error;
cpi->total += frame_psnr;
{
@@ -5358,18 +5159,36 @@
vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
vp8_clear_system_state();
- frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
- frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+ IF_RTCD(&cpi->rtcd.variance));
+
+ sq_error = ye + ue + ve;
+
+ frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
+ cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+ cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+ cpi->total_sq_error2 += sq_error;
+ cpi->totalp += frame_psnr2;
+
+ frame_ssim2 = vp8_calc_ssim(cpi->Source,
+ &cm->post_proc_buffer, 1, &weight,
+ IF_RTCD(&cpi->rtcd.variance));
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
- cpi->totalp_y += y2;
- cpi->totalp_u += u2;
- cpi->totalp_v += v2;
- cpi->totalp += frame_psnr2;
- cpi->total_sq_error2 += sq_error;
-
}
}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 6d0cbd9..0e53f68 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -496,11 +496,6 @@
struct vpx_codec_pkt_list *output_pkt_list;
int first_pass_done;
-#if !(CONFIG_REALTIME_ONLY)
- unsigned char *fp_motion_map;
- unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save;
-#endif
-
#if 0
// Experimental code for lagged and one pass
ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
@@ -604,12 +599,17 @@
int encoding_thread_count;
pthread_t *h_encoding_thread;
+ pthread_t h_filter_thread;
+
MB_ROW_COMP *mb_row_ei;
ENCODETHREAD_DATA *en_thread_data;
+ LPFTHREAD_DATA lpf_thread_data;
//events
sem_t *h_event_start_encoding;
sem_t h_event_end_encoding;
+ sem_t h_event_start_lpf;
+ sem_t h_event_end_lpf;
#endif
TOKENLIST *tplist;
@@ -642,8 +642,6 @@
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
int fixed_divide[512];
#endif
- // Flag to indicate temporal filter method
- int use_weighted_temporal_filter;
#if CONFIG_PSNR
int count;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 6ab85ad..0790d35 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -664,7 +664,8 @@
case V_PRED:
case H_PRED:
case TM_PRED:
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index bfffe43..9797f5f 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -842,7 +842,8 @@
{
int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
- if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+ if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+ (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
{
int percent_low = 0;
@@ -851,9 +852,12 @@
// If we are are below the optimal buffer fullness level and adherence
// to buffering contraints is important to the end useage then adjust
// the per frame target.
- if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
{
- percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+ percent_low =
+ (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+ one_percent_bits;
if (percent_low > 100)
percent_low = 100;
@@ -864,7 +868,8 @@
else if (cpi->bits_off_target < 0)
{
// Adjust per frame data target downwards to compensate.
- percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+ percent_low = (int)(100 * -cpi->bits_off_target /
+ (cpi->total_byte_count * 8));
if (percent_low > 100)
percent_low = 100;
@@ -873,39 +878,60 @@
}
// lower the target bandwidth for this frame.
- cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+ cpi->this_frame_target =
+ (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
- // Are we using allowing control of active_worst_allowed_q according to buffer level.
+ // Are we allowing control of active_worst_allowed_q
+ // according to buffer level.
if (cpi->auto_worst_q)
{
int critical_buffer_level;
- // For streaming applications the most important factor is cpi->buffer_level as this takes
- // into account the specified short term buffering constraints. However, hitting the long
- // term clip data rate target is also important.
+ // For streaming applications the most important factor is
+ // cpi->buffer_level as this takes into account the
+ // specified short term buffering constraints. However,
+ // hitting the long term clip data rate target is also
+ // important.
if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
{
- // Take the smaller of cpi->buffer_level and cpi->bits_off_target
- critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+ // Take the smaller of cpi->buffer_level and
+ // cpi->bits_off_target
+ critical_buffer_level =
+ (cpi->buffer_level < cpi->bits_off_target)
+ ? cpi->buffer_level : cpi->bits_off_target;
}
- // For local file playback short term buffering contraints are less of an issue
+ // For local file playback short term buffering constraints
+ // are less of an issue
else
{
- // Consider only how we are doing for the clip as a whole
+ // Consider only how we are doing for the clip as a
+ // whole
critical_buffer_level = cpi->bits_off_target;
}
- // Set the active worst quality based upon the selected buffer fullness number.
+ // Set the active worst quality based upon the selected
+ // buffer fullness number.
if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
{
- if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+ if ( critical_buffer_level >
+ (cpi->oxcf.optimal_buffer_level >> 2) )
{
- int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
- int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
+ INT64 qadjustment_range =
+ cpi->worst_quality - cpi->ni_av_qi;
+ INT64 above_base =
+ (critical_buffer_level -
+ (cpi->oxcf.optimal_buffer_level >> 2));
- // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level)
- // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4)
- cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
+ // Step active worst quality down from
+ // cpi->ni_av_qi when (critical_buffer_level ==
+ // cpi->optimal_buffer_level) to
+ // cpi->worst_quality when
+ // (critical_buffer_level ==
+ // cpi->optimal_buffer_level >> 2)
+ cpi->active_worst_quality =
+ cpi->worst_quality -
+ ((qadjustment_range * above_base) /
+ (cpi->oxcf.optimal_buffer_level*3>>2));
}
else
{
@@ -965,6 +991,15 @@
// Set the active worst quality
cpi->active_worst_quality = cpi->worst_quality;
}
+
+ // Special trap for constrained quality mode
+ // "active_worst_quality" may never drop below cq level
+ // for any frame type.
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ cpi->active_worst_quality < cpi->cq_target_quality)
+ {
+ cpi->active_worst_quality = cpi->cq_target_quality;
+ }
}
// Test to see if we have to drop a frame
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index b0dcfe0..c706c57 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -745,7 +745,8 @@
{
x->e_mbd.mode_info_context->mbmi.mode = mode;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
@@ -2038,7 +2039,8 @@
case H_PRED:
case TM_PRED:
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
- vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+ RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+ (&x->e_mbd);
macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
rate2 += rate_y;
distortion2 += distortion;
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index 4ebcba1..64d67c6 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -11,298 +11,13 @@
#include "vpx_scale/yv12config.h"
#include "math.h"
+#include "onyx_int.h"
-#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
-#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
-
-static int width_y;
-static int height_y;
-static int height_uv;
-static int width_uv;
-static int stride_uv;
-static int stride;
-static int lumimask;
-static int luminance;
-static double plane_summed_weights = 0;
-
-static short img12_sum_block[8*4096*4096*2] ;
-
-static short img1_sum[8*4096*2];
-static short img2_sum[8*4096*2];
-static int img1_sq_sum[8*4096*2];
-static int img2_sq_sum[8*4096*2];
-static int img12_mul_sum[8*4096*2];
-
-
-double vp8_similarity
-(
- int mu_x,
- int mu_y,
- int pre_mu_x2,
- int pre_mu_y2,
- int pre_mu_xy2
-)
-{
- int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
-
- mu_x2 = mu_x * mu_x;
- mu_y2 = mu_y * mu_y;
- mu_xy = mu_x * mu_y;
-
- theta_x2 = 64 * pre_mu_x2 - mu_x2;
- theta_y2 = 64 * pre_mu_y2 - mu_y2;
- theta_xy = 64 * pre_mu_xy2 - mu_xy;
-
- return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
-}
-
-double vp8_ssim
-(
- const unsigned char *img1,
- const unsigned char *img2,
- int stride_img1,
- int stride_img2,
- int width,
- int height
-)
-{
- int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
-
- double plane_quality, weight, mean;
-
- short *img1_sum_ptr1, *img1_sum_ptr2;
- short *img2_sum_ptr1, *img2_sum_ptr2;
- int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
- int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
- int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
-
- plane_quality = 0;
-
- if (lumimask)
- plane_summed_weights = 0.0f;
- else
- plane_summed_weights = (height - 7) * (width - 7);
-
- //some prologue for the main loop
- temp = 8 * width;
-
- img1_sum_ptr1 = img1_sum + temp;
- img2_sum_ptr1 = img2_sum + temp;
- img1_sq_sum_ptr1 = img1_sq_sum + temp;
- img2_sq_sum_ptr1 = img2_sq_sum + temp;
- img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
- for (x = 0; x < width; x++)
- {
- img1_sum[x] = img1[x];
- img2_sum[x] = img2[x];
- img1_sq_sum[x] = img1[x] * img1[x];
- img2_sq_sum[x] = img2[x] * img2[x];
- img12_mul_sum[x] = img1[x] * img2[x];
-
- img1_sum_ptr1[x] = 0;
- img2_sum_ptr1[x] = 0;
- img1_sq_sum_ptr1[x] = 0;
- img2_sq_sum_ptr1[x] = 0;
- img12_mul_sum_ptr1[x] = 0;
- }
-
- //the main loop
- for (y = 1; y < height; y++)
- {
- img1 += stride_img1;
- img2 += stride_img2;
-
- temp = (y - 1) % 9 * width;
-
- img1_sum_ptr1 = img1_sum + temp;
- img2_sum_ptr1 = img2_sum + temp;
- img1_sq_sum_ptr1 = img1_sq_sum + temp;
- img2_sq_sum_ptr1 = img2_sq_sum + temp;
- img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
- temp = y % 9 * width;
-
- img1_sum_ptr2 = img1_sum + temp;
- img2_sum_ptr2 = img2_sum + temp;
- img1_sq_sum_ptr2 = img1_sq_sum + temp;
- img2_sq_sum_ptr2 = img2_sq_sum + temp;
- img12_mul_sum_ptr2 = img12_mul_sum + temp;
-
- for (x = 0; x < width; x++)
- {
- img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x];
- img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x];
- img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
- img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
- img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
- }
-
- if (y > 6)
- {
- //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum
- temp = (y + 1) % 9 * width;
-
- img1_sum_ptr1 = img1_sum + temp;
- img2_sum_ptr1 = img2_sum + temp;
- img1_sq_sum_ptr1 = img1_sq_sum + temp;
- img2_sq_sum_ptr1 = img2_sq_sum + temp;
- img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
- for (x = 0; x < width; x++)
- {
- img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x];
- img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x];
- img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
- img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
- img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
- }
-
- //here we calculate the sum over the 8x8 block of pixels
- //this is done by sliding a window across the column sums for the last 8 lines
- //each time adding the new column sum, and subtracting the one which fell out of the window
- img1_block = 0;
- img2_block = 0;
- img1_sq_block = 0;
- img2_sq_block = 0;
- img12_mul_block = 0;
-
- //prologue, and calculation of simularity measure from the first 8 column sums
- for (x = 0; x < 8; x++)
- {
- img1_block += img1_sum_ptr1[x];
- img2_block += img2_sum_ptr1[x];
- img1_sq_block += img1_sq_sum_ptr1[x];
- img2_sq_block += img2_sq_sum_ptr1[x];
- img12_mul_block += img12_mul_sum_ptr1[x];
- }
-
- if (lumimask)
- {
- y2 = y - 7;
- x2 = 0;
-
- if (luminance)
- {
- mean = (img2_block + img1_block) / 128.0f;
-
- if (!(y2 % 2 || x2 % 2))
- *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
- }
- else
- {
- mean = *(img12_sum_block + y2 * width_uv + x2);
- mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
- mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
- mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
- mean /= 512.0f;
- }
-
- weight = mean < 40 ? 0.0f :
- (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
- plane_summed_weights += weight;
-
- plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
- }
- else
- plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-
- //and for the rest
- for (x = 8; x < width; x++)
- {
- img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
- img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
- img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
- img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
- img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
-
- if (lumimask)
- {
- y2 = y - 7;
- x2 = x - 7;
-
- if (luminance)
- {
- mean = (img2_block + img1_block) / 128.0f;
-
- if (!(y2 % 2 || x2 % 2))
- *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
- }
- else
- {
- mean = *(img12_sum_block + y2 * width_uv + x2);
- mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
- mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
- mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
- mean /= 512.0f;
- }
-
- weight = mean < 40 ? 0.0f :
- (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
- plane_summed_weights += weight;
-
- plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
- }
- else
- plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
- }
- }
- }
-
- if (plane_summed_weights == 0)
- return 1.0f;
- else
- return plane_quality / plane_summed_weights;
-}
-
-double vp8_calc_ssim
-(
- YV12_BUFFER_CONFIG *source,
- YV12_BUFFER_CONFIG *dest,
- int lumamask,
- double *weight
-)
-{
- double a, b, c;
- double frame_weight;
- double ssimv;
-
- width_y = source->y_width;
- height_y = source->y_height;
- height_uv = source->uv_height;
- width_uv = source->uv_width;
- stride_uv = dest->uv_stride;
- stride = dest->y_stride;
-
- lumimask = lumamask;
-
- luminance = 1;
- a = vp8_ssim(source->y_buffer, dest->y_buffer,
- source->y_stride, dest->y_stride, source->y_width, source->y_height);
- luminance = 0;
-
- frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
-
- if (frame_weight == 0)
- a = b = c = 1.0f;
- else
- {
- b = vp8_ssim(source->u_buffer, dest->u_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
-
- c = vp8_ssim(source->v_buffer, dest->v_buffer,
- source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
- }
-
- ssimv = a * .8 + .1 * (b + c);
-
- *weight = frame_weight;
-
- return ssimv;
-}
-
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
// Google version of SSIM
// SSIM
#define KERNEL 3
@@ -520,3 +235,174 @@
*ssim_v /= uvsize;
return ssim_all;
}
+
+
+// Gathers the raw statistics of one 16x16 window for the SSIM computation.
+//   s, sp     first pixel and row stride of the first window
+//   r, rp     first pixel and row stride of the second window
+//   sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+//             running totals of s, r, s*s, r*r and s*r. They are added to,
+//             never reset, so the caller must zero them beforehand.
+void ssim_parms_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int rows_left, col;
+
+    for (rows_left = 16; rows_left > 0; --rows_left)
+    {
+        for (col = 0; col < 16; ++col)
+        {
+            const int sv = s[col];
+            const int rv = r[col];
+
+            *sum_s    += sv;
+            *sum_r    += rv;
+            *sum_sq_s += sv * sv;
+            *sum_sq_r += rv * rv;
+            *sum_sxr  += sv * rv;
+        }
+
+        // Step both windows down to their next row.
+        s += sp;
+        r += rp;
+    }
+}
+// Gathers the raw statistics of one 8x8 window for the SSIM computation.
+//   s, sp     first pixel and row stride of the first window
+//   r, rp     first pixel and row stride of the second window
+//   sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+//             running totals of s, r, s*s, r*r and s*r. They are added to,
+//             never reset, so the caller must zero them beforehand.
+void ssim_parms_8x8_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int rows_left, col;
+
+    for (rows_left = 8; rows_left > 0; --rows_left)
+    {
+        for (col = 0; col < 8; ++col)
+        {
+            const int sv = s[col];
+            const int rv = r[col];
+
+            *sum_s    += sv;
+            *sum_r    += rv;
+            *sum_sq_s += sv * sv;
+            *sum_sq_r += rv * rv;
+            *sum_sxr  += sv * rv;
+        }
+
+        // Step both windows down to their next row.
+        s += sp;
+        r += rp;
+    }
+}
+
+// SSIM stabilising constants, pre-scaled for sums taken over 256 pixels.
+static const long long c1 = 426148;  // 256^2 * (.01*255)^2
+static const long long c2 = 3835331; // 256^2 * (.03*255)^2
+
+// Returns the SSIM index of one window from its accumulated statistics.
+//   sum_s, sum_r        sums of the pixels of the two windows
+//   sum_sq_s, sum_sq_r  sums of the squared pixels of each window
+//   sum_sxr             sum of the per-pixel products s[i] * r[i]
+//   count               number of pixels that went into each sum
+static double similarity
+(
+    unsigned long sum_s,
+    unsigned long sum_r,
+    unsigned long sum_sq_s,
+    unsigned long sum_sq_r,
+    unsigned long sum_sxr,
+    int count
+)
+{
+    // Promote everything to signed 64-bit before multiplying. unsigned long
+    // is only 32 bits wide on some targets, and terms such as
+    // 2 * sum_s * sum_r do not fit in 32 bits for a 16x16 window.
+    const long long n    = count;
+    const long long s    = sum_s;
+    const long long r    = sum_r;
+    const long long sq_s = sum_sq_s;
+    const long long sq_r = sum_sq_r;
+    const long long sxr  = sum_sxr;
+
+    // The sums grow with the window size, so the constants must scale with
+    // count^2. c1/c2 are stored for count == 256 (65536 == 256^2), which
+    // leaves the 16x16 result unchanged and corrects the 8x8 one.
+    const long long k1 = c1 * n * n / 65536;
+    const long long k2 = c2 * n * n / 65536;
+
+    const long long ssim_n = (2*s*r + k1) * (2*n*sxr - 2*s*r + k2);
+
+    const long long ssim_d = (s*s + r*r + k1) *
+                             (n*sq_s - s*s + n*sq_r - r*r + k2);
+
+    return ssim_n * 1.0 / ssim_d;
+}
+
+// Returns the SSIM index of the 16x16 window at s (row stride sp) compared
+// with the 16x16 window at r (row stride rp).
+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
+                         const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+
+    // rtcd is NULL whenever the caller's IF_RTCD() expands to NULL (builds
+    // without CONFIG_RUNTIME_CPU_DETECT). Use the compile-time default in
+    // that case instead of dereferencing a null table.
+    (rtcd ? rtcd->ssimpf : vp8_ssimpf)
+        (s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+
+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+// Returns the SSIM index of the 8x8 window at s (row stride sp) compared
+// with the 8x8 window at r (row stride rp).
+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
+                       const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+
+    // rtcd is NULL whenever the caller's IF_RTCD() expands to NULL (builds
+    // without CONFIG_RUNTIME_CPU_DETECT). Use the compile-time default in
+    // that case instead of dereferencing a null table.
+    (rtcd ? rtcd->ssimpf_8x8 : vp8_ssimpf_8x8)
+        (s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+
+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// TODO: (jbb) tried to scale this function such that we may be able to use it
+// for distortion metric in mode selection code ( provided we do a reconstruction)
+//
+// Returns an SSIM-based distortion for the 16x16 window at s (row stride sp)
+// versus the one at r (row stride rp): 256 * (256 * (1 - ssim))^2, truncated
+// to long. Identical windows give 0.
+long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
+           const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+    long long ls, lr, lsq_s, lsq_r, lsxr;
+    long long ssim_n;
+    long long ssim_d;
+    double ssim3;
+
+    // rtcd is NULL whenever the caller's IF_RTCD() expands to NULL (builds
+    // without CONFIG_RUNTIME_CPU_DETECT). Use the compile-time default in
+    // that case instead of dereferencing a null table.
+    (rtcd ? rtcd->ssimpf : vp8_ssimpf)
+        (s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+
+    // Promote to signed 64-bit before multiplying. unsigned long is only
+    // 32 bits wide on some targets, too narrow for the products below.
+    ls    = sum_s;
+    lr    = sum_r;
+    lsq_s = sum_sq_s;
+    lsq_r = sum_sq_r;
+    lsxr  = sum_sxr;
+
+    ssim_n = (2*ls*lr + c1) * (2*256*lsxr - 2*ls*lr + c2);
+
+    ssim_d = (ls*ls + lr*lr + c1) *
+             (256*lsq_s - ls*ls + 256*lsq_r - lr*lr + c2);
+
+    // 256 * (1 - ssim_n / ssim_d), evaluated in floating point. The integer
+    // form 256 * (ssim_d - ssim_n) / ssim_d dropped the fractional part and
+    // could overflow long long in the multiplication.
+    ssim3 = 256.0 - 256.0 * ssim_n / ssim_d;
+    return (long)( 256*ssim3 * ssim3 );
+}
+// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels
+// such that the window regions overlap block boundaries to penalize blocking
+// artifacts.
+
+// Returns the mean SSIM index over the non-overlapping 8x8 tiles of a plane.
+//   img1, img2                top-left pixels of the two planes
+//   stride_img1, stride_img2  distance between consecutive rows of each plane
+//   width, height             plane size in pixels
+//   rtcd                      variance function table, forwarded to ssim_8x8()
+double vp8_ssim2
+(
+    unsigned char *img1,
+    unsigned char *img2,
+    int stride_img1,
+    int stride_img2,
+    int width,
+    int height,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    int i,j;
+    int samples = 0;
+    double ssim_total=0;
+
+    // we can sample points as frequently as we like start with 1 per 8x8
+    // Only tiles that fit entirely inside width x height are visited, so no
+    // pixels beyond the stated plane size are read.
+    for(i=0; i + 8 <= height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
+    {
+        for(j=0; j + 8 <= width; j+=8 )
+        {
+            // Offset both planes by j so that each tile of the row is
+            // measured rather than the leftmost tile over and over.
+            ssim_total += ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2, rtcd);
+            samples++;
+        }
+    }
+
+    // A plane smaller than one tile has nothing to compare; report it as
+    // identical rather than dividing by zero.
+    if (samples == 0)
+        return 1.0;
+
+    // Average over the tiles actually measured.
+    return ssim_total / samples;
+}
+// Returns the frame SSIM of dest against source as a weighted mix of the
+// per-plane results: 0.8 * Y + 0.1 * (U + V).
+//   source, dest  the two frames; plane sizes are taken from source
+//   lumamask      not referenced by this implementation
+//   weight        out: always set to 1
+//   rtcd          variance function table, forwarded to vp8_ssim2()
+double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    const double luma_ssim = vp8_ssim2(source->y_buffer, dest->y_buffer,
+                                       source->y_stride, dest->y_stride,
+                                       source->y_width, source->y_height,
+                                       rtcd);
+
+    const double u_ssim = vp8_ssim2(source->u_buffer, dest->u_buffer,
+                                    source->uv_stride, dest->uv_stride,
+                                    source->uv_width, source->uv_height,
+                                    rtcd);
+
+    const double v_ssim = vp8_ssim2(source->v_buffer, dest->v_buffer,
+                                    source->uv_stride, dest->uv_stride,
+                                    source->uv_width, source->uv_height,
+                                    rtcd);
+
+    *weight = 1;
+
+    return luma_ssim * .8 + .1 * (u_ssim + v_ssim);
+}
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 0f8e654..fd36b22 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -287,8 +287,7 @@
int byte;
int frame;
int mb_col, mb_row;
- unsigned int filter_weight[MAX_LAG_BUFFERS];
- unsigned char *mm_ptr = cpi->fp_motion_map;
+ unsigned int filter_weight;
int mb_cols = cpi->common.mb_cols;
int mb_rows = cpi->common.mb_rows;
int MBs = cpi->common.MBs;
@@ -306,13 +305,6 @@
unsigned char *u_buffer = mbd->pre.u_buffer;
unsigned char *v_buffer = mbd->pre.v_buffer;
- if (!cpi->use_weighted_temporal_filter)
- {
- // Temporal filtering is unweighted
- for (frame = 0; frame < frame_count; frame++)
- filter_weight[frame] = 1;
- }
-
for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
#if ALT_REF_MC_ENABLED
@@ -338,34 +330,9 @@
+ (VP8BORDERINPIXELS - 19);
#endif
- // Read & process macroblock weights from motion map
- if (cpi->use_weighted_temporal_filter)
- {
- weight_cap = 2;
-
- for (frame = alt_ref_index-1; frame >= 0; frame--)
- {
- w = *(mm_ptr + (frame+1)*MBs);
- filter_weight[frame] = w < weight_cap ? w : weight_cap;
- weight_cap = w;
- }
-
- filter_weight[alt_ref_index] = 2;
-
- weight_cap = 2;
-
- for (frame = alt_ref_index+1; frame < frame_count; frame++)
- {
- w = *(mm_ptr + frame*MBs);
- filter_weight[frame] = w < weight_cap ? w : weight_cap;
- weight_cap = w;
- }
-
- }
-
for (frame = 0; frame < frame_count; frame++)
{
- int err;
+ int err = 0;
if (cpi->frames[frame] == NULL)
continue;
@@ -374,28 +341,25 @@
mbd->block[0].bmi.mv.as_mv.col = 0;
#if ALT_REF_MC_ENABLED
- //if (filter_weight[frame] == 0)
- {
#define THRESH_LOW 10000
#define THRESH_HIGH 20000
- // Correlation has been lost try MC
- err = vp8_temporal_filter_find_matching_mb_c
- (cpi,
- cpi->frames[alt_ref_index],
- cpi->frames[frame],
- mb_y_offset,
- THRESH_LOW);
+ // Find best match in this frame by MC
+ err = vp8_temporal_filter_find_matching_mb_c
+ (cpi,
+ cpi->frames[alt_ref_index],
+ cpi->frames[frame],
+ mb_y_offset,
+ THRESH_LOW);
- if (filter_weight[frame] < 2)
- {
- // Set weight depending on error
- filter_weight[frame] = err<THRESH_LOW
- ? 2 : err<THRESH_HIGH ? 1 : 0;
- }
- }
#endif
- if (filter_weight[frame] != 0)
+ // Assign higher weight to matching MB if its error
+ // score is lower. If not applying MC default behavior
+ // is to weight all MBs equal.
+ filter_weight = err<THRESH_LOW
+ ? 2 : err<THRESH_HIGH ? 1 : 0;
+
+ if (filter_weight != 0)
{
// Construct the predictors
vp8_temporal_filter_predictors_mb_c
@@ -415,7 +379,7 @@
predictor,
16,
strength,
- filter_weight[frame],
+ filter_weight,
accumulator,
count);
@@ -425,7 +389,7 @@
predictor + 256,
8,
strength,
- filter_weight[frame],
+ filter_weight,
accumulator + 256,
count + 256);
@@ -435,7 +399,7 @@
predictor + 320,
8,
strength,
- filter_weight[frame],
+ filter_weight,
accumulator + 320,
count + 320);
}
@@ -491,7 +455,6 @@
byte += stride - 8;
}
- mm_ptr++;
mb_y_offset += 16;
mb_uv_offset += 8;
}
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5befd3b..bf17ea8 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -85,6 +85,19 @@
unsigned int *sse \
);
+/* Declares sym with the signature shared by the SSIM statistics gatherers.
+ * Going by the C implementations (ssim_parms_c, ssim_parms_8x8_c):
+ *   s, sp   first pixel and row stride of the first window
+ *   r, rp   first pixel and row stride of the second window
+ *   sum_*   running totals of s, r, s*s, r*r and s*r; they are added to,
+ *           not reset, so callers zero them first.
+ * The expansion already ends in ';', so uses of this macro carry no
+ * trailing semicolon of their own.
+ */
+#define prototype_ssimpf(sym) \
+ void (sym) \
+ ( \
+ unsigned char *s, \
+ int sp, \
+ unsigned char *r, \
+ int rp, \
+ unsigned long *sum_s, \
+ unsigned long *sum_r, \
+ unsigned long *sum_sq_s, \
+ unsigned long *sum_sq_r, \
+ unsigned long *sum_sxr \
+ );
#define prototype_getmbss(sym) unsigned int (sym)(const short *)
@@ -306,6 +319,15 @@
#endif
extern prototype_sad(vp8_variance_get4x4sse_cs);
+#ifndef vp8_ssimpf
+#define vp8_ssimpf ssim_parms_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf)
+
+#ifndef vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_8x8)
typedef prototype_sad(*vp8_sad_fn_t);
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
@@ -315,6 +337,10 @@
typedef prototype_variance2(*vp8_variance2_fn_t);
typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
typedef prototype_getmbss(*vp8_getmbss_fn_t);
+
+typedef prototype_ssimpf(*vp8_ssimpf_fn_t)
+
+
typedef struct
{
vp8_sad_fn_t sad4x4;
@@ -365,6 +391,11 @@
vp8_sad_multi_d_fn_t sad8x8x4d;
vp8_sad_multi_d_fn_t sad4x4x4d;
+#if CONFIG_PSNR
+ vp8_ssimpf_fn_t ssimpf_8x8;
+ vp8_ssimpf_fn_t ssimpf;
+#endif
+
} vp8_variance_rtcd_vtable_t;
typedef struct
@@ -378,6 +409,7 @@
vp8_sad_multi_fn_t sdx3f;
vp8_sad_multi1_fn_t sdx8f;
vp8_sad_multi_d_fn_t sdx4df;
+
} vp8_variance_fn_ptr_t;
#if CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
index 21e2e50..03ecec4 100644
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/encoder/x86/sad_sse4.asm
@@ -186,7 +186,7 @@
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
- movdqu XMMWORD PTR [rdi], xmm1
+ movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -224,7 +224,7 @@
PROCESS_16X2X8 0
mov rdi, arg(4) ;Results
- movdqu XMMWORD PTR [rdi], xmm1
+ movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -262,7 +262,7 @@
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
- movdqu XMMWORD PTR [rdi], xmm1
+ movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -303,7 +303,7 @@
PROCESS_8X2X8 0
PROCESS_8X2X8 0
mov rdi, arg(4) ;Results
- movdqu XMMWORD PTR [rdi], xmm1
+ movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
@@ -339,7 +339,7 @@
PROCESS_4X2X8 0
mov rdi, arg(4) ;Results
- movdqu XMMWORD PTR [rdi], xmm1
+ movdqa XMMWORD PTR [rdi], xmm1
; begin epilog
pop rdi
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
new file mode 100644
index 0000000..c267cdb
--- /dev/null
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -0,0 +1,215 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; TABULATE_SSIM - adds the words in xmm3 (s) and xmm4 (r) into sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr (xmm15..xmm11); overwrites xmm1, xmm2 and xmm3
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s (per-word, unsigned saturating add)
+ paddusw xmm14, xmm4 ; sum_r (per-word, unsigned saturating add)
+ movdqa xmm1, xmm3 ; work on a copy so xmm3 is still s for the cross term
+ pmaddwd xmm1, xmm1 ; s*s, adjacent word products added into dwords
+ paddq xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4 ; work on a copy so xmm4 is still r for the cross term
+ pmaddwd xmm2, xmm2 ; r*r, adjacent word products added into dwords
+ paddq xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4 ; s*r, adjacent word products added into dwords
+ paddq xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum the four dwords of register %1 into its low qword (needs xmm0 == 0, overwrites xmm2)
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0 ; low two dwords, each widened to a qword
+ punpckhdq xmm2,xmm0 ; high two dwords, each widened to a qword
+ paddq %1,xmm2 ; two qword partial sums
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0 ; %1 = low partial sum, upper qword cleared
+ punpckhqdq xmm2,xmm0 ; xmm2 = high partial sum, upper qword cleared
+ paddq %1,xmm2 ; total in the low qword of %1
+%endmacro
+
+; Sum across the register %1 starting with w words (needs xmm0 == 0, overwrites xmm1 and xmm2)
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0 ; low four words, each widened to a dword
+ punpckhwd xmm1,xmm0 ; high four words, each widened to a dword
+ paddd %1, xmm1 ; four dword partial sums
+ SUM_ACROSS_Q %1 ; fold the four dwords into the low qword
+%endmacro
+;void vp8_ssim_parms_16x16_sse3(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+; Walks 16 rows of 16 pixels from s (stride sp) and r (stride rp) and stores each of the five totals as a 64-bit value through its pointer.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse3)
+sym(vp8_ssim_parms_16x16_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ push rsi
+ push rdi
+ ; end prolog - NOTE(review): xmm11-xmm15 are used below without being saved; they are callee-saved in the Microsoft x64 convention - confirm for Win64 builds
+
+ mov rsi, arg(0) ;s
+ movsxd rcx, dword ptr arg(1) ;sp (int: sign-extend, upper half of the slot is not guaranteed)
+ mov rdi, arg(2) ;r
+ movsxd rax, dword ptr arg(3) ;rp (int: sign-extend, upper half of the slot is not guaranteed)
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movq [rdi], xmm15;
+ mov rdi,arg(5)
+ movq [rdi], xmm14;
+ mov rdi,arg(6)
+ movq [rdi], xmm13;
+ mov rdi,arg(7)
+ movq [rdi], xmm12;
+ mov rdi,arg(8)
+ movq [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_ssim_parms_8x8_sse3(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+; Walks 8 rows of 8 pixels from s (stride sp) and r (stride rp) and stores each of the five totals as a 64-bit value through its pointer.
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse3)
+sym(vp8_ssim_parms_8x8_sse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ push rsi
+ push rdi
+ ; end prolog - NOTE(review): xmm11-xmm15 are used below without being saved; they are callee-saved in the Microsoft x64 convention - confirm for Win64 builds
+
+ mov rsi, arg(0) ;s
+ movsxd rcx, dword ptr arg(1) ;sp (int: sign-extend, upper half of the slot is not guaranteed)
+ mov rdi, arg(2) ;r
+ movsxd rax, dword ptr arg(3) ;rp (int: sign-extend, upper half of the slot is not guaranteed)
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+NextRow2:
+
+ ;grab source and reference pixels
+ movq xmm5, [rsi]
+ movq xmm6, [rdi]
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz NextRow2
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movq [rdi], xmm15;
+ mov rdi,arg(5)
+ movq [rdi], xmm14;
+ mov rdi,arg(6)
+ movq [rdi], xmm13;
+ mov rdi,arg(7)
+ movq [rdi], xmm12;
+ mov rdi,arg(8)
+ movq [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 5d1a17d..c2c30de 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -790,7 +790,7 @@
ret
-;void vp8_half_horiz_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -800,8 +800,8 @@
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -835,7 +835,7 @@
add rsi, r8
%endif
-vp8_half_horiz_vert_variance16x_h_1:
+vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ;
@@ -863,7 +863,7 @@
%endif
sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance16x_h_1 ;
+ jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -910,8 +910,7 @@
pop rbp
ret
-
-;void vp8_half_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -921,8 +920,124 @@
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_vert_variance16x_h_sse2)
-sym(vp8_half_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16-pixel-wide rows: average each ref row with itself shifted by one byte, then with the next row, and compare against src
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ; zero, used to widen bytes to words
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0 ; xmm4 = words of the upper 8 bytes
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9,d10..d15
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6 ; each difference word lands in the top half of a dword
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16 ; arithmetic shift sign-extends the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread the squared sums against zero (xmm5)
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same spreading for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper 8 bytes down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance8x_h_sse2)
+sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -945,7 +1060,7 @@
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
-vp8_half_vert_variance16x_h_1:
+vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
@@ -969,7 +1084,7 @@
%endif
sub rcx, 1 ;
- jnz vp8_half_vert_variance16x_h_1 ;
+ jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -1016,8 +1131,7 @@
pop rbp
ret
-
-;void vp8_half_horiz_variance16x_h_sse2
+;void vp8_half_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -1027,8 +1141,116 @@
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
-sym(vp8_half_horiz_variance16x_h_sse2):
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16-pixel-wide rows: average each ref row with the row below it and compare against src
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi] ; first ref row
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0 ; zero, used to widen bytes to words
+
+vp8_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi] ; next ref row
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of the lower 8 bytes
+ punpckhbw xmm4, xmm0 ; xmm4 = words of the upper 8 bytes
+
+ movq xmm2, QWORD PTR [rdi] ; src bytes 0..7
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8] ; src bytes 8..15
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3 ; keep this ref row for the next iteration
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz vp8_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6 ; each difference word lands in the top half of a dword
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16 ; arithmetic shift sign-extends the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread the squared sums against zero (xmm5)
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same spreading for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper 8 bytes down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance8x_h_sse2)
+sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -1050,7 +1272,7 @@
movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ;
-vp8_half_horiz_variance16x16_1:
+vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
@@ -1073,7 +1295,7 @@
add rdi, r9
%endif
sub rcx, 1 ;
- jnz vp8_half_horiz_variance16x16_1 ;
+ jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -1120,6 +1342,109 @@
pop rbp
ret
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16-pixel-wide rows: average each ref row with itself shifted by one byte and compare against src
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ; zero, used to widen bytes to words
+
+vp8_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0 ; xmm1 = words of the upper 8 bytes
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6 ; each difference word lands in the top half of a dword
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16 ; arithmetic shift sign-extends the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread the squared sums against zero (xmm5)
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same spreading for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper 8 bytes down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 6eed98e..07358c0 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -456,146 +456,6 @@
return (xxsum - ((xsum * xsum) >> 7));
}
-unsigned int vp8_i_variance16x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
-
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
- vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
- *sse = var;
- return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
- vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
-
- *sse = var;
- return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp8_i_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
- int f2soffset = (src_pixels_per_line >> 1);
- int f2doffset = (dst_pixels_per_line >> 1);
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + f2soffset, src_pixels_per_line,
- dst_ptr + f2doffset, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + f2soffset + 8, src_pixels_per_line,
- dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
- int f2soffset = (src_pixels_per_line >> 1);
- int f2doffset = (dst_pixels_per_line >> 1);
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp8_filter_block2d_bil_var_mmx(
- src_ptr + f2soffset, src_pixels_per_line,
- dst_ptr + f2doffset, dst_pixels_per_line, 8,
- vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- *sse = xxsum0;
- return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
const unsigned char *src_ptr,
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 7cf6a63..0edda30 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -81,6 +81,16 @@
int *sum,
unsigned int *sumsquared
);
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@
int *sum,
unsigned int *sumsquared
);
+void vp8_half_horiz_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@
int *sum,
unsigned int *sumsquared
);
+void vp8_half_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -262,21 +292,21 @@
if (xoffset == 4 && yoffset == 0)
{
- vp8_half_horiz_variance16x_h_sse2(
+ vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
- vp8_half_vert_variance16x_h_sse2(
+ vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
- vp8_half_horiz_vert_variance16x_h_sse2(
+ vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
@@ -317,11 +347,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -329,11 +354,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -341,11 +361,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else
{
@@ -356,17 +371,16 @@
&xsum0, &xxsum0
);
-
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
}
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -406,11 +420,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -418,11 +427,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -430,11 +434,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else
{
@@ -449,11 +448,10 @@
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
}
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
@@ -474,21 +472,21 @@
if (xoffset == 4 && yoffset == 0)
{
- vp8_half_horiz_variance16x_h_sse2(
+ vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
- vp8_half_vert_variance16x_h_sse2(
+ vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
- vp8_half_horiz_vert_variance16x_h_sse2(
+ vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
@@ -506,81 +504,6 @@
return (xxsum - ((xsum * xsum) >> 7));
}
-unsigned int vp8_i_variance16x16_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
-
- vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
- vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
-
- *sse = var;
- return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_wmt(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse)
-{
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
- vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
- vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
-
- *sse = var;
- return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_i_sub_pixel_variance16x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
- return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-)
-{
-
- return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
@@ -589,21 +512,14 @@
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -616,21 +532,13 @@
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -643,21 +551,14 @@
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
index 750ae8b..eb5d486 100644
--- a/vp8/encoder/x86/variance_ssse3.c
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -76,8 +76,8 @@
unsigned int *sse
)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
+ int xsum0;
+ unsigned int xxsum0;
// note we could avoid these if statements if the calling function
// just called the appropriate functions inside.
@@ -87,14 +87,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -102,14 +94,6 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -117,24 +101,65 @@
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else
{
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
}
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0;
+ unsigned int xxsum0;
+ /* An offset pair of (4,0), (0,4) or (4,4) goes to a dedicated half-pixel SSE2 routine; every other pair goes to the SSSE3 bilinear filter. Each call fills xsum0 and xxsum0 for 8 rows. */
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+ /* *sse receives xxsum0 unchanged; the return value subtracts xsum0 squared divided by 128 (>> 7), presumably the 16x8 pixel count. */
+ *sse = xxsum0;
+ return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 1e2fb34..3560f74 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,7 @@
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
@@ -295,6 +296,9 @@
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+#undef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
+
#undef vp8_variance_subpixvar16x16
#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index c7639a7..5ab3641 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -176,6 +176,25 @@
d->dqcoeff
);
}
+#if CONFIG_PSNR
+#if ARCH_X86_64
+typedef void ssimpf
+(
+ unsigned char *s, /* NOTE(review): s/sp and r/rp look like two (pointer, stride) pairs -- confirm */
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s, /* NOTE(review): the five unsigned long pointers are presumably outputs -- confirm against the asm */
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+);
+/* ssimpf is a function type, not a pointer type, so the two externs below declare functions with exactly this signature. */
+extern ssimpf vp8_ssim_parms_16x16_sse3;
+extern ssimpf vp8_ssim_parms_8x8_sse3;
+#endif /* ARCH_X86_64 */
+#endif /* CONFIG_PSNR */
#endif
@@ -280,6 +299,8 @@
cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
+
+
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
@@ -334,13 +355,23 @@
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3;
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
+#if CONFIG_PSNR
+#if ARCH_X86_64
+ cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
+ cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
+#endif
+#endif
+
}
#endif
+
+
#if HAVE_SSE4_1
if (SSE4_1Enabled)
{
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 4daadee..ba9caa7 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -24,6 +24,7 @@
VP8_COMMON_SRCS-yes += common/entropymv.c
VP8_COMMON_SRCS-yes += common/extend.c
VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
VP8_COMMON_SRCS-yes += common/findnearmv.c
VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
VP8_COMMON_SRCS-yes += common/idctllm.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index a45a379..2622738 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -199,7 +199,7 @@
{
int mb_r = (cfg->g_h + 15) / 16;
int mb_c = (cfg->g_w + 15) / 16;
- size_t packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c);
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
FIRSTPASS_STATS *stats;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index c0ae250..8f0681f 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -116,6 +116,7 @@
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 7980a0f..b07ee8f 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -34,8 +34,10 @@
#File list for armv6
# encoder
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 470ea73..a1ff192 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -332,7 +332,7 @@
* extended in one of two ways. First, a second, algorithm specific structure
* can be allocated and the priv member pointed to it. Alternatively, this
* structure can be made the first member of the algorithm specific structure,
- * and the pointer casted to the proper type.
+ * and the pointer cast to the proper type.
*/
struct vpx_codec_priv
{
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index a872b28..be64cd7 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -168,15 +168,10 @@
%macro GET_GOT 1
push %1
call %%get_got
- %%sub_offset:
- jmp %%exitGG
%%get_got:
- mov %1, [esp]
- add %1, fake_got - %%sub_offset
- ret
- %%exitGG:
+ pop %1 ; the call above pushed the run-time address of the get_got label; load it into the macro's register argument
%undef GLOBAL
- %define GLOBAL(x) x + %1 - fake_got
+ %define GLOBAL(x) x + %1 - %%get_got
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
@@ -289,7 +284,6 @@
%elifidn __OUTPUT_FORMAT__,macho32
%macro SECTION_RODATA 0
section .text
-fake_got:
%endmacro
%else
%define SECTION_RODATA section .rodata