Merge "mips msa vp9 common headers added"
diff --git a/.mailmap b/.mailmap
index fb82a24..0bfda12 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,18 +1,26 @@
Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/AUTHORS b/AUTHORS
index a9aa481..2f63d7c 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,10 +3,11 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation
diff --git a/CHANGELOG b/CHANGELOG
index 97c9a7b..a318784 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,26 @@
+2015-04-03 v1.4.0 "Indian Runner Duck"
+ This release includes significant improvements to the VP9 codec.
+
+ - Upgrading:
+ This release is ABI incompatible with 1.3.0. It drops the compatibility
+ layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+ controls for VP9.
+
+ - Enhancements:
+ Faster VP9 encoding and decoding
+ Multithreaded VP9 decoding (tile and frame-based)
+ Multithreaded VP9 encoding - on by default
+ YUV 4:2:2 and 4:4:4 support in VP9
+ 10 and 12bit support in VP9
+ 64bit ARM support by replacing ARM assembly with intrinsics
+
+ - Bug Fixes:
+ Fixes a VP9 bitstream issue in Profile 1. This only affected files that are
+ not YUV 4:2:0.
+
+ - Known Issues:
+ Frame Parallel decoding fails for segmented and non-4:2:0 files.
+
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without
diff --git a/README b/README
index 584a344..fcd1c2e 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -62,12 +62,6 @@
armv7s-darwin-gcc
mips32-linux-gcc
mips64-linux-gcc
- ppc32-darwin8-gcc
- ppc32-darwin9-gcc
- ppc32-linux-gcc
- ppc64-darwin8-gcc
- ppc64-darwin9-gcc
- ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
@@ -78,6 +72,7 @@
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
+ x86-darwin14-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
@@ -95,6 +90,7 @@
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
+ x86_64-darwin14-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
@@ -111,6 +107,7 @@
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
+ universal-darwin14-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/build/make/Makefile b/build/make/Makefile
index e048e9c..fc7749a 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -383,8 +383,8 @@
.libs: $(LIBS)
@touch $@
$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
ifeq ($(MAKECMDGOALS),dist)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index e54de21..84ca4b9 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -640,12 +640,6 @@
*i[3456]86*)
tgt_isa=x86
;;
- *powerpc64*)
- tgt_isa=ppc64
- ;;
- *powerpc*)
- tgt_isa=ppc32
- ;;
*sparc*)
tgt_isa=sparc
;;
@@ -1070,29 +1064,6 @@
check_add_asflags -march=${tgt_isa}
check_add_asflags -KPIC
;;
- ppc*)
- enable_feature ppc
- bits=${tgt_isa##ppc}
- link_with_cc=gcc
- setup_gnu_toolchain
- add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- soft_enable altivec
- enabled altivec && add_cflags -maltivec
-
- case "$tgt_os" in
- linux*)
- add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
- ;;
- darwin*)
- darwin_arch="-arch ppc"
- enabled ppc64 && darwin_arch="${darwin_arch}64"
- add_cflags ${darwin_arch} -m${bits} -fasm-blocks
- add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- add_ldflags ${darwin_arch} -m${bits}
- enabled altivec && add_cflags -faltivec
- ;;
- esac
- ;;
x86*)
case ${tgt_os} in
win*)
diff --git a/configure b/configure
index 728521a..e05dd69 100755
--- a/configure
+++ b/configure
@@ -112,12 +112,6 @@
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-android-gcc"
all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -247,8 +241,6 @@
mips
x86
x86_64
- ppc32
- ppc64
"
ARCH_EXT_LIST="
edsp
@@ -269,8 +261,6 @@
sse4_1
avx
avx2
-
- altivec
"
HAVE_LIST="
${ARCH_EXT_LIST}
@@ -621,12 +611,6 @@
universal-darwin*)
darwin_ver=${tgt_os##darwin}
- # Snow Leopard (10.6/darwin10) dropped support for PPC
- # Include PPC support for all prior versions
- if [ $darwin_ver -lt 10 ]; then
- fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
- fi
-
# Tiger (10.4/darwin8) brought support for x86
if [ $darwin_ver -ge 8 ]; then
fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
@@ -727,7 +711,7 @@
esac
# Other toolchain specific defaults
- case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+ case $toolchain in x86*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
diff --git a/libs.mk b/libs.mk
index e48d55c..3046e1b 100644
--- a/libs.mk
+++ b/libs.mk
@@ -230,25 +230,27 @@
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
EXPORT_FILE := libvpx.syms
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
libvpx.dylib )
else
-LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
EXPORT_FILE := libvpx.ver
-SYM_LINK := libvpx.so
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
- libvpx.so libvpx.so.$(VERSION_MAJOR) \
- libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+ libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+ libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
endif
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
$(notdir $(LIBVPX_SO_SYMLINKS))
$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
libvpx.ver: $(call enabled,CODEC_EXPORTS)
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index c836fac..2c396ce 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -52,7 +52,7 @@
const uint8_t *thresh1);
#endif // CONFIG_VP9_HIGHBITDEPTH
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
#if HAVE_SSE2
@@ -144,6 +144,7 @@
loopfilter_op_ = GET_PARAM(0);
ref_loopfilter_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
+ count_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
}
@@ -151,6 +152,7 @@
protected:
int bit_depth_;
+ int count_;
int mask_;
loop_op_t loopfilter_op_;
loop_op_t ref_loopfilter_op_;
@@ -206,7 +208,6 @@
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
int32_t p = kNumCoeffs/32;
- int count = 1;
uint16_t tmp_s[kNumCoeffs];
int j = 0;
@@ -238,13 +239,13 @@
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
- ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
#else
- ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+ ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
#endif // CONFIG_VP9_HIGHBITDEPTH
for (int j = 0; j < kNumCoeffs; ++j) {
@@ -305,19 +306,18 @@
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
int32_t p = kNumCoeffs / 32;
- int count = 1;
for (int j = 0; j < kNumCoeffs; ++j) {
s[j] = rnd.Rand16() & mask_;
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
- ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
#else
- ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+ ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
#endif // CONFIG_VP9_HIGHBITDEPTH
for (int j = 0; j < kNumCoeffs; ++j) {
err_count += ref_s[j] != s[j];
@@ -521,55 +521,62 @@
SSE2, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 8),
+ &vp9_highbd_lpf_horizontal_4_c, 8, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 8),
+ &vp9_highbd_lpf_vertical_4_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 8),
+ &vp9_highbd_lpf_horizontal_8_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 8),
+ &vp9_highbd_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 8, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 8),
+ &vp9_highbd_lpf_vertical_8_c, 8, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 8),
+ &wrapper_vertical_16_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 10),
+ &vp9_highbd_lpf_horizontal_4_c, 10, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 10),
+ &vp9_highbd_lpf_vertical_4_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 10),
+ &vp9_highbd_lpf_horizontal_8_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 10),
+ &vp9_highbd_lpf_horizontal_16_c, 10, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 10, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 10),
+ &vp9_highbd_lpf_vertical_8_c, 10, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 10),
+ &wrapper_vertical_16_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 12),
+ &vp9_highbd_lpf_horizontal_4_c, 12, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 12),
+ &vp9_highbd_lpf_vertical_4_c, 12, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 12),
+ &vp9_highbd_lpf_horizontal_8_c, 12, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 12),
+ &vp9_highbd_lpf_horizontal_16_c, 12, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 12, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 12),
+ &vp9_highbd_lpf_vertical_8_c, 12, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 12),
+ &wrapper_vertical_16_c, 12, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 8),
+ &wrapper_vertical_16_dual_c, 8, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 10),
+ &wrapper_vertical_16_dual_c, 10, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 12)));
+ &wrapper_vertical_16_dual_c, 12, 1)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2, Loop8Test6Param,
::testing::Values(
- make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
- make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
- make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8),
- make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+ make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
+ make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+ make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
@@ -577,7 +584,9 @@
INSTANTIATE_TEST_CASE_P(
AVX2, Loop8Test6Param,
::testing::Values(
- make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8)));
+ make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+ 2)));
#endif
#if HAVE_SSE2
@@ -635,20 +644,22 @@
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
make_tuple(&vp9_lpf_horizontal_16_neon,
- &vp9_lpf_horizontal_16_c, 8),
+ &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_neon,
+ &vp9_lpf_horizontal_16_c, 8, 2),
make_tuple(&wrapper_vertical_16_neon,
- &wrapper_vertical_16_c, 8),
+ &wrapper_vertical_16_c, 8, 1),
make_tuple(&wrapper_vertical_16_dual_neon,
- &wrapper_vertical_16_dual_c, 8),
+ &wrapper_vertical_16_dual_c, 8, 1),
make_tuple(&vp9_lpf_horizontal_8_neon,
- &vp9_lpf_horizontal_8_c, 8),
+ &vp9_lpf_horizontal_8_c, 8, 1),
make_tuple(&vp9_lpf_vertical_8_neon,
- &vp9_lpf_vertical_8_c, 8),
+ &vp9_lpf_vertical_8_c, 8, 1),
#endif // HAVE_NEON_ASM
make_tuple(&vp9_lpf_horizontal_4_neon,
- &vp9_lpf_horizontal_4_c, 8),
+ &vp9_lpf_horizontal_4_c, 8, 1),
make_tuple(&vp9_lpf_vertical_4_neon,
- &vp9_lpf_vertical_4_c, 8)));
+ &vp9_lpf_vertical_4_c, 8, 1)));
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test9Param,
::testing::Values(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index ba82da4..ba73f86 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -230,7 +230,7 @@
&vp9_idct4x4_1_add_c,
TX_4X4, 1)));
-#if HAVE_NEON
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
::testing::Values(
@@ -258,7 +258,7 @@
&vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_neon,
TX_4X4, 1)));
-#endif // HAVE_NEON
+#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index fa264f2..5847074 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -29,7 +29,7 @@
enum DecodeMode {
kSerialMode,
- kFrameParallMode
+ kFrameParallelMode
};
const int kDecodeMode = 0;
@@ -95,7 +95,7 @@
vpx_codec_dec_cfg_t cfg = {0};
char str[256];
- if (mode == kFrameParallMode) {
+ if (mode == kFrameParallelMode) {
flags |= VPX_CODEC_USE_FRAME_THREADING;
}
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 34c1516..0ae011e 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -402,6 +402,7 @@
VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
YUV_RAW_INPUT_WIDTH=352
diff --git a/test/vp9_frame_parallel_test.cc b/test/vp9_frame_parallel_test.cc
index 0594d75..f0df88a 100644
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@@ -29,7 +29,7 @@
#if CONFIG_WEBM_IO
-struct FileList {
+struct PauseFileList {
const char *name;
// md5 sum for decoded frames which does not include skipped frames.
const char *expected_md5;
@@ -39,7 +39,8 @@
// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
// seek to next key frame and then continue decoding until the end. Return
// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFile(const string &filename, int num_threads, int pause_num) {
+string DecodeFileWithPause(const string &filename, int num_threads,
+ int pause_num) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
int in_frames = 0;
@@ -92,12 +93,12 @@
return string(md5.Get());
}
-void DecodeFiles(const FileList files[]) {
- for (const FileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFilesWithPause(const PauseFileList files[]) {
+ for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
- DecodeFile(iter->name, t, iter->pause_frame_num))
+ DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
<< "threads = " << t;
}
}
@@ -106,19 +107,19 @@
TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
// vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames.
- static const FileList files[] = {
+ static const PauseFileList files[] = {
{ "vp90-2-07-frame_parallel-1.webm",
- "6ea7c3875d67252e7caf2bc6e75b36b1", 6},
+ "6ea7c3875d67252e7caf2bc6e75b36b1", 6 },
{ "vp90-2-07-frame_parallel-1.webm",
- "4bb634160c7356a8d7d4299b6dc83a45", 12},
+ "4bb634160c7356a8d7d4299b6dc83a45", 12 },
{ "vp90-2-07-frame_parallel-1.webm",
- "89772591e6ef461f9fa754f916c78ed8", 26},
- { NULL, NULL, 0},
+ "89772591e6ef461f9fa754f916c78ed8", 26 },
+ { NULL, NULL, 0 },
};
- DecodeFiles(files);
+ DecodeFilesWithPause(files);
}
-struct InvalidFileList {
+struct FileList {
const char *name;
// md5 sum for decoded frames which does not include corrupted frames.
const char *expected_md5;
@@ -128,8 +129,8 @@
// Decodes |filename| with |num_threads|. Return the md5 of the decoded
// frames which does not include corrupted frames.
-string DecodeInvalidFile(const string &filename, int num_threads,
- int expected_frame_count) {
+string DecodeFile(const string &filename, int num_threads,
+ int expected_frame_count) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
@@ -173,37 +174,47 @@
return string(md5.Get());
}
-void DecodeInvalidFiles(const InvalidFileList files[]) {
- for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFiles(const FileList files[]) {
+ for (const FileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
- DecodeInvalidFile(iter->name, t, iter->expected_frame_count))
+ DecodeFile(iter->name, t, iter->expected_frame_count))
<< "threads = " << t;
}
}
}
TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
- static const InvalidFileList files[] = {
+ static const FileList files[] = {
// invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames. The 11th frame has corrupted data.
{ "invalid-vp90-2-07-frame_parallel-1.webm",
- "0549d0f45f60deaef8eb708e6c0eb6cb", 30},
+ "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
// invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
// one key frame for every ten frames. The 1st and 31st frames have
// corrupted data.
{ "invalid-vp90-2-07-frame_parallel-2.webm",
- "6a1f3cf6f9e7a364212fadb9580d525e", 20},
+ "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
// invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
// one key frame for every ten frames. The 5th and 13th frames have
// corrupted data.
{ "invalid-vp90-2-07-frame_parallel-3.webm",
- "8256544308de926b0681e04685b98677", 27},
- { NULL, NULL, 0},
+ "8256544308de926b0681e04685b98677", 27 },
+ { NULL, NULL, 0 },
};
- DecodeInvalidFiles(files);
+ DecodeFiles(files);
}
+TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {  // Uncorrupted clips.
+ static const FileList files[] = {  // { name, expected md5, frame count }
+#if CONFIG_VP9_HIGHBITDEPTH  // Otherwise only the terminator entry remains.
+ { "vp92-2-20-10bit-yuv420.webm",
+ "a16b99df180c584e8db2ffeda987d293", 10 },
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+ { NULL, NULL, 0 },  // Terminator: DecodeFiles() stops at a NULL name.
+ };
+ DecodeFiles(files);  // Compares the md5 of each clip for 2..8 threads.
+}
#endif // CONFIG_WEBM_IO
} // namespace
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index d73a447..de51c80 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -17,7 +17,8 @@
# Environment check: Make sure input is available.
vpxdec_verify_environment() {
if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
- [ ! -e "${VP9_FPM_WEBM_FILE}" ] ; then
+ [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
+ [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
@@ -87,12 +88,29 @@
--frame-parallel
done
fi
+}
+vpxdec_vp9_webm_less_than_50_frames() {
+  # Ensure that reaching EOF in webm_guess_framerate doesn't result in invalid
+  # frames in the actual webm_read_frame calls: decode
+  # ${VP9_LT_50_FRAMES_WEBM_FILE} and check the frame count vpxdec reports in
+  # its --summary output. Returns 1 (after logging) on a mismatch.
+  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    # Plain 'local': 'local readonly name=...' only declares an extra local
+    # variable called 'readonly'; it does not make 'name' read-only.
+    local decoder="$(vpx_tool_path vpxdec)"
+    local expected=10
+    local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+      "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
+      | awk '/^[0-9]+ decoded frames/ { print $1 }')
+    # Compare as strings: when vpxdec prints no summary line num_frames is
+    # empty, and an integer test ('-ne') would fail with an error status that
+    # the 'if' treats as "false", silently skipping the failure branch.
+    if [ "${num_frames}" != "${expected}" ]; then
+      elog "Output frames ($num_frames) != expected ($expected)"
+      return 1
+    fi
+  fi
}
vpxdec_tests="vpxdec_vp8_ivf
vpxdec_vp8_ivf_pipe_input
vpxdec_vp9_webm
- vpxdec_vp9_webm_frame_parallel"
+ vpxdec_vp9_webm_frame_parallel
+ vpxdec_vp9_webm_less_than_50_frames"
run_tests vpxdec_verify_environment "${vpxdec_tests}"
diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c
index 46064e6..159fddc 100644
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@
fprintf(mvs, "\n");
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
int b_row;
@@ -129,7 +128,6 @@
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{
int b_row;
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index 0693326..edd6ca0 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -153,11 +153,11 @@
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
- sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
- usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
- vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
@@ -170,11 +170,11 @@
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
- sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
- usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
- vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm
deleted file mode 100644
index a4ce915..0000000
--- a/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;# but the output will be. So two reads and a perm
-;# for the input, but only one store for the output.
-copy_mem16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xe000
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
-cp_16x16_loop:
- lvsl v0, 0, r3 ;# permutate value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v1, v1, v2, v0
-
- stvx v1, 0, r5
-
- add r3, r3, r4 ;# increment source pointer
- add r5, r5, r6 ;# increment destination pointer
-
- bdnz cp_16x16_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm
deleted file mode 100644
index 4da2e94..0000000
--- a/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl sixtap_predict_ppc
- .globl sixtap_predict8x4_ppc
- .globl sixtap_predict8x8_ppc
- .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
- load_c \V0, HFilter, r5, r9, r10
-
- addi r5, r5, 16
- lvx \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
- load_c v0, VFilter, r6, r3, r10
-
- vspltish v5, 8
- vspltish v6, 3
- vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v1, v0, 1
- vspltb v2, v0, 2
- vspltb v3, v0, 3
- vspltb v4, v0, 4
- vspltb v5, v0, 5
- vspltb v0, v0, 0
-.endm
-
-.macro vpre_load
- Vprolog
- li r10, 16
- lvx v10, 0, r9 ;# v10..v14 = first 5 rows
- lvx v11, r10, r9
- addi r9, r9, 32
- lvx v12, 0, r9
- lvx v13, r10, r9
- addi r9, r9, 32
- lvx v14, 0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
- ;# (Re,Ro) += (V*T)
- vmuleub \TMP, \V, \T ;# trashes v8
- vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
- vmuloub \TMP, \V, \T
- vadduhm \Ro, \Ro, \TMP ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
- vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
- vadduhm v16, v6, v8
- vmuloub v8, \P0, v0
- vadduhm v17, v6, v8
- Msum v16, v17, \P2, v2, v8
- Msum v16, v17, \P3, v3, v8
- Msum v16, v17, \P5, v5, v8
-
- vmuleub v18, \P1, v1 ;# 2 negative taps
- vmuloub v19, \P1, v1
- Msum v18, v19, \P4, v4, v8
-
- vsubuhs v16, v16, v18 ;# subtract neg from pos
- vsubuhs v17, v17, v19
- vsrh v16, v16, v7 ;# divide by 128
- vsrh v17, v17, v7 ;# v16 v17 = evens, odds
- vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
- vmrglh v19, v16, v17
- vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
- vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
- vadduhm v21, v20, v24
- vmuloub v24, \P0, v13
- vadduhm v22, v20, v24
- Msum v21, v22, \P2, v15, v25
- Msum v21, v22, \P3, v16, v25
- Msum v21, v22, \P5, v18, v25
-
- vmuleub v23, \P1, v14 ;# 2 negative taps
- vmuloub v24, \P1, v14
- Msum v23, v24, \P4, v17, v25
-
- vsubuhs v21, v21, v23 ;# subtract neg from pos
- vsubuhs v22, v22, v24
- vsrh v21, v21, v19 ;# divide by 128
- vsrh v22, v22, v19 ;# v16 v17 = evens, odds
- vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
- vmrglh v24, v21, v22
- vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
- vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
- stvx \P0, 0, r7
- add r7, r7, r8 ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
- addi r9, r9, 16 ;# P5 = newest input row
- lvx \P5, 0, r9
- Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
- luma_v v10, v11, v12, v13, v14, v15
- luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
- luma_vtwo
- luma_v v12, v13, v14, v15, v10, v11
- luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
- luma_vfour
- luma_v v14, v15, v10, v11, v12, v13
- luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
- vmsummbm \R, v13, \I, v15
- vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx \VD, 0, \RS
- lvx v20, r10, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
- vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
- vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
- Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
- vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
- Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
-
- vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
- vsrh \R, \R, v19
-
- vpkuhus \R, \R, \R ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v20, 0, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, v20, v20, v21
-.endm
- .text
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xff87
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- vertical_only_4x4
-
- ;# load up horizontal filter
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_4x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_4x4
-
-vertical_only_4x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_4x4:
- load_c v20, b_hilo_4x4, 0, r9, r10
- load_c v21, b_hilo, 0, r9, r10
-
- ;# reposition input so that it can go through the
- ;# filtering phase with one pass.
- vperm v0, v0, v1, v20 ;# 0 1 x x
- vperm v2, v2, v3, v20 ;# 2 3 x x
- vperm v4, v4, v5, v20 ;# 4 5 x x
- vperm v6, v6, v7, v20 ;# 6 7 x x
-
- vperm v0, v0, v2, v21 ;# 0 1 2 3
- vperm v4, v4, v6, v21 ;# 4 5 6 7
-
- vsldoi v1, v0, v4, 4
- vsldoi v2, v0, v4, 8
- vsldoi v3, v0, v4, 12
-
- vsldoi v5, v4, v8, 4
-
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
- stvx v0, 0, r1
-
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 4(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 8(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 12(r1)
- stw r0, 0(r7)
-
- b exit_4x4
-
-store_4x4:
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v4, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v5, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x4
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_8x4
-
-second_pass_pre_copy_8x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x4:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
- b exit_8x4
-
-store_8x4:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x4
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
-
- b exit_8x4
-
-store_aligned2_8x4:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;# register there is no need to loop. Everything can stay in registers.
-sixtap_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x8
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
- interp_8x8 v9
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x8
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v10
- interp_8x8 v11
- interp_8x8 v12
-
- b second_pass_8x8
-
-second_pass_pre_copy_8x8:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x8:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
- vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
- vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
- vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
- vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
- b exit_8x8
-
-store_8x8:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x8
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
- w_8x8 v8, r7, r0, r8
- w_8x8 v9, r7, r0, r8
-
- b exit_8x8
-
-store_aligned2_8x8:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
- vperm v8, v8, v9, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
- addi r7, r7, 16
- stvx v8, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
-;# edges. One of the filters can be null, but both won't be. Needs to use a
-;# temporary buffer because the source buffer can't be modified and the buffer
-;# for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-416(r1) ;# create space on the stack
-
- ;# Three possiblities
- ;# 1. First filter is null. Don't use a temp buffer.
- ;# 2. Second filter is null. Don't use a temp buffer.
- ;# 3. Neither are null, use temp buffer.
-
- ;# First Pass (horizontal edge)
- ;# setup pointers for src
- ;# if possiblity (1) then setup the src pointer to be the orginal and jump
- ;# to second pass. this is based on if x_offset is 0.
-
- ;# load up horizontal filter
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- load_hfilter v4, v5
-
- beq- copy_horizontal_16x21
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v14, b_hperm, 0, r9, r10
-
- ;# These statements are guessing that there won't be a second pass,
- ;# but if there is then inside the bypass they need to be set
- li r0, 16 ;# prepare for no vertical filter
-
- ;# Change the output pointer and pitch to be the actual
- ;# desination instead of a temporary buffer.
- addi r9, r7, 0
- addi r5, r8, 0
-
- ;# no vertical filter, so write the output from the first pass
- ;# directly into the output buffer.
- beq- no_vertical_filter_bypass
-
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# setup counter for the number of lines that are going to be filtered
- li r0, 21
-
- ;# use the stack as temporary storage
- la r9, 48(r1)
- li r5, 16
-
-no_vertical_filter_bypass:
-
- mtctr r0
-
- ;# rounding added in on the multiply
- vspltisw v10, 8
- vspltisw v12, 3
- vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v13, 7
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
-horizontal_loop_16x16:
-
- lvsl v15, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v1, 0, r3
- lvx v2, r10, r3
- lvx v3, r12, r3
-
- vperm v8, v1, v2, v15
- vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
-
- vsldoi v11, v8, v9, 4
-
- ;# set 0
- vmsummbm v6, v4, v8, v12 ;# taps times elements
- vmsummbm v0, v5, v11, v6
-
- ;# set 1
- vsldoi v10, v8, v9, 1
- vsldoi v11, v8, v9, 5
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v1, v5, v11, v6
-
- ;# set 2
- vsldoi v10, v8, v9, 2
- vsldoi v11, v8, v9, 6
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v2, v5, v11, v6
-
- ;# set 3
- vsldoi v10, v8, v9, 3
- vsldoi v11, v8, v9, 7
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v3, v5, v11, v6
-
- vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
-
- vsrh v0, v0, v13 ;# divide v0, v1 by 128
- vsrh v1, v1, v13
-
- vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
- vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
-
- stvx v0, 0, r9
- add r9, r9, r5
-
- add r3, r3, r4
-
- bdnz horizontal_loop_16x16
-
- ;# check again to see if vertical filter needs to be done.
- cmpi cr0, r6, 0
- beq cr0, end_16x16
-
- ;# yes there is, so go to the second pass
- b second_pass_16x16
-
-copy_horizontal_16x21:
- li r10, 21
- mtctr r10
-
- li r10, 16
-
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# this is done above if there is a horizontal filter,
- ;# if not it needs to be done down here.
- slwi r6, r6, 4 ;# index into vertical filter array
-
- ;# always write to the stack when doing a horizontal copy
- la r9, 48(r1)
-
-copy_horizontal_loop_16x21:
- lvsl v15, 0, r3 ;# permutate value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v8, v1, v2, v15
-
- stvx v8, 0, r9
- addi r9, r9, 16
-
- add r3, r3, r4
-
- bdnz copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
- ;# always read from the stack when doing a vertical filter
- la r9, 48(r1)
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v7, 7
-
- vpre_load
-
- luma_vsix
- luma_vsix
- luma_vfour
-
-end_16x16:
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-HFilter:
- .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
- .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
- .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
- .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
- .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
- .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
- .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
- .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
- .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
- .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
- .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
- .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
-
- .align 4
-VFilter:
- .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
- .align 4
-b_hperm:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-B_0123:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-B_4567:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
- .align 4
-B_89AB:
- .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
- .align 4
-b_hilo:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-b_hilo_4x4:
- .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm
deleted file mode 100644
index fd8aa66..0000000
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl bilinear_predict4x4_ppc
- .globl bilinear_predict8x4_ppc
- .globl bilinear_predict8x8_ppc
- .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r9, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r9, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r9, r0
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm intput
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro HFilter V
- vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456
- vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v16 v17 = evens, odds
- vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_4x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_4x4_b:
-
- stvx v0, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v1, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_8x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_8x4_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# bilinear_predict8x8_ppc: 8x8 bilinear prediction. Runs a horizontal pass,
-;# then (when required) a vertical pass, and writes eight 8-pixel rows to dst.
-;# NOTE(review): HProlog, hfilter_8, load_and_align_8, load_vfilter, vfilter_16
-;# and w_8x8 are macros defined outside this excerpt; remarks about what they
-;# do are inferred from how they are invoked here -- confirm in their bodies.
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0 ;# flag the upper-half VRSAVE bits 0xfff0 as in use
- ori r12, r12, 0xffff ;# flag all lower-half VRSAVE bits as in use
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- ;# presumably branches to the given label when no horizontal filtering
- ;# is needed -- confirm in HProlog
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- ;# horizontally filter the 8 rows of the block into v0..v7
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
- hfilter_8 v4, 1
- hfilter_8 v5, 1
- hfilter_8 v6, 1
- hfilter_8 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- ;# (the condition tested here is set before this point, presumably by
- ;# HProlog -- confirm)
- beq store_out_8x8_b
-
- hfilter_8 v8, 0 ;# 9th row; second argument 0 on the last row
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array (y_offset * 32)
-
- ;# no horizontal filtering on this path: just fetch 9 rows into v0..v8
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
- load_and_align_8 v5, 1
- load_and_align_8 v6, 1
- load_and_align_8 v7, 1
- load_and_align_8 v8, 0
-
-second_pass_8x8_b:
- ;# build the rounding constant 8 << 3 = 64 in every halfword of v18
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- ;# each output row is filtered from a row and the row below it
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-store_out_8x8_b:
-
- cmpi cr0, r8, 8 ;# dst_pitch == 8: output rows are contiguous
- beq cr0, store_aligned_8x8_b
-
- ;# general pitch: write the rows one at a time
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8_b:
- ;# b_hilo_b selects bytes 0..7 of each source, so every vperm packs two
- ;# 8-pixel rows into one vector
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- ;# four 16-byte stores cover the 64-byte block
- ;# (assumes dst is 16-byte aligned on this path -- TODO confirm)
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filters a horizontal line: reads one (possibly unaligned) source row and
-;# leaves 16 filtered 8-bit pixels in \V.
-;# Arguments:
-;# V destination vector register
-;# increment_counter non-zero: advance r3 by the pitch in r4 afterwards
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1 ;# input shifted left by one byte
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2 ;# input shifted left by two bytes
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3 ;# input shifted left by three bytes
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# shift v24, v25 right by v19 (divide by 128)
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- ;# NOTE(review): v0 is only the second vperm source; v28 presumably
- ;# selects bytes from \V alone, making v0's contents irrelevant -- confirm
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-;# Loads 16 (possibly unaligned) bytes starting at r3 into \V, unfiltered.
-;# V destination vector register
-;# increment_counter non-zero: advance r3 by the pitch in r4 afterwards
-;# Uses r10 as the offset of the second load and trashes v17, v21, v22.
-.macro load_and_align_16 V, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# an unaligned row can straddle two vectors: load the one at r3
- ;# and the one at r3 + r10, then merge them with the alignment permute.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-;# Stores the 16 bytes of \V at the address in r7 with one vector store.
-;# V source vector register
-;# increment_counter non-zero: advance r7 by the pitch in r8 afterwards
-.macro write_16 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
- .align 2
-;# bilinear_predict16x16_ppc: 16x16 bilinear prediction. Runs a horizontal
-;# pass, then (when required) a vertical pass, and writes sixteen 16-pixel
-;# rows to dst with one vector store per row.
-;# NOTE(review): HProlog, load_vfilter and vfilter_16 are macros defined
-;# outside this excerpt; remarks about them are inferred from their use here.
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- ;# presumably branches to the given label when no horizontal filtering
- ;# is needed -- confirm in HProlog
- HProlog second_pass_16x16_pre_copy_b
-
- ;# horizontally filter the 16 rows of the block into v0..v15
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- ;# (the condition tested here is set before this point, presumably by
- ;# HProlog -- confirm)
- beq store_out_16x16_b
-
- hfilter_16 v16, 0 ;# 17th row; do not advance r3 afterwards
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array (y_offset * 32)
-
- ;# no horizontal filtering on this path: just fetch 17 rows into v0..v16
- load_and_align_16 v0, 1
- load_and_align_16 v1, 1
- load_and_align_16 v2, 1
- load_and_align_16 v3, 1
- load_and_align_16 v4, 1
- load_and_align_16 v5, 1
- load_and_align_16 v6, 1
- load_and_align_16 v7, 1
- load_and_align_16 v8, 1
- load_and_align_16 v9, 1
- load_and_align_16 v10, 1
- load_and_align_16 v11, 1
- load_and_align_16 v12, 1
- load_and_align_16 v13, 1
- load_and_align_16 v14, 1
- load_and_align_16 v15, 1
- load_and_align_16 v16, 0
-
-second_pass_16x16_b:
- ;# build the rounding constant 8 << 3 = 64 in every halfword of v18
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- ;# each output row is filtered from a row and the row below it
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-store_out_16x16_b:
-
- ;# one stvx per row, stepping r7 by dst_pitch (r8) between rows
- write_16 v0, 1
- write_16 v1, 1
- write_16 v2, 1
- write_16 v3, 1
- write_16 v4, 1
- write_16 v5, 1
- write_16 v6, 1
- write_16 v7, 1
- write_16 v8, 1
- write_16 v9, 1
- write_16 v10, 1
- write_16 v11, 1
- write_16 v12, 1
- write_16 v13, 1
- write_16 v14, 1
- write_16 v15, 0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-;# Horizontal filter taps: 8 entries of 16 bytes. Each entry repeats the
-;# 4-byte group (128 - 16*k, 16*k, 0, 0) four times, for k = 0..7.
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
-;# Vertical filter taps: 8 entries of 32 bytes (two 16-byte rows each).
-;# The first row of an entry holds 128 - 16*k in every byte, the second
-;# holds 16*k, for k = 0..7. The 32-byte entry size matches the
-;# "slwi r6, r6, 5" indexing used by the bilinear predictors.
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-;# Byte permutation 0 4 8 C 1 5 9 D ...: a 4x4 transpose of byte positions
-;# (presumably the "perm output" constant that unscrambles the horizontal
-;# filter result -- confirm where it is loaded).
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
-;# Four overlapping 4-byte windows starting at bytes 0, 1, 2 and 3.
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
-;# Four overlapping 4-byte windows starting at bytes 4, 5, 6 and 7.
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-;# vperm selector: bytes 0..7 of the first source followed by bytes 0..7 of
-;# the second source. There is no .align here; the label still lands on a
-;# 16-byte boundary because it directly follows an aligned 16-byte table.
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm
deleted file mode 100644
index 117d9cf..0000000
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl short_idct4x4llm_ppc
-
-;# load_c: loads a 16-byte constant into vector register \V.
-;# V destination vector register
-;# LABEL symbol of the constant
-;# OFF index operand passed to lvx (callers in this file pass 0)
-;# R0, R1 scratch GPRs; R0 receives the high half of the address and
-;# R1 the complete address
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha ;# high-adjusted upper 16 bits of the address
- la \R1, \LABEL@l(\R0) ;# add the low 16 bits
- lvx \V, \OFF, \R1
-.endm
-
-;# short_idct4x4llm_ppc: 4x4 inverse transform of 16 coefficients.
-;# Two 1-D passes with a transpose between them, then (x + 4) >> 3 rounding,
-;# a final transpose and a store of four rows of four shorts each.
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch (added to r4 between output rows, i.e. a byte count)
- .align 2
-short_idct4x4llm_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff8 ;# flag the upper-half VRSAVE bits 0xfff8 as in use
- mtspr 256, r12 ;# set VRSAVE
-
- load_c v8, sinpi8sqrt2, 0, r9, r10
- load_c v9, cospi8sqrt2minus1, 0, r9, r10
- load_c v10, hi_hi, 0, r9, r10
- load_c v11, lo_lo, 0, r9, r10
- load_c v12, shift_16, 0, r9, r10
-
- li r10, 16
- lvx v0, 0, r3 ;# input ip[0], ip[ 4]
- lvx v1, r10, r3 ;# input ip[8], ip[12]
-
- ;# first pass
- ;# the 16-bit inputs are sign-extended to 32-bit words before the math
- vupkhsh v2, v0 ;# row starting at ip[0]
- vupkhsh v3, v1 ;# row starting at ip[8]
- vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
-
- ;# The sinpi8sqrt2 constant 35468 does not fit a signed halfword, and
- ;# vmulosh multiplies signed halfwords, so the product is formed with
- ;# 35468 - 65536; adding the input back after the shift compensates.
- ;# NOTE(review): this reading of the add is inferred -- confirm.
- vupklsh v0, v0 ;# row starting at ip[4]
- vmulosh v4, v0, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- ;# the cosine constant is stored minus one, so the input is added back
- vupklsh v1, v1 ;# row starting at ip[12]
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v3, v1, v8
- vsraw v3, v3, v12
- vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v0, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v0
-
- vaddsws v3, v3, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- ;# transpose input
- vmrghw v4, v0, v1 ;# a0 b0 a1 b1
- vmrghw v5, v2, v3 ;# c0 d0 c1 d1
-
- vmrglw v6, v0, v1 ;# a2 b2 a3 b3
- vmrglw v7, v2, v3 ;# c2 d2 c3 d3
-
- vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
- vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
-
- vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
- vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
-
- ;# second pass
- ;# same butterfly as the first pass, applied to the transposed data
- vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
-
- vmulosh v4, v1, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v3, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v3
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v2, v3, v8
- vsraw v2, v2, v12
- vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vaddsws v3, v2, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- vspltish v6, 4 ;# rounding term
- vspltish v7, 3 ;# shift count
-
- ;# narrow the 32-bit results back to saturated 16-bit values
- vpkswss v0, v0, v1
- vpkswss v1, v2, v3
-
- vaddshs v0, v0, v6 ;# + 4
- vaddshs v1, v1, v6
-
- vsrah v0, v0, v7 ;# >> 3
- vsrah v1, v1, v7
-
- ;# transpose output
- vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
- vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
-
- vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
- vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
-
- stwu r1,-416(r1) ;# create space on the stack
-
- ;# The vectors are spilled to the stack and copied out one word at a
- ;# time, so only 8 bytes (4 shorts) are written per output row.
- ;# NOTE(review): the stvx to 0(r1) overwrites the word stwu just stored
- ;# there; the frame is released with addi below, so it is not read back.
- stvx v0, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4) ;# row 0
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4) ;# row 1
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- stvx v1, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4) ;# row 2
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4) ;# row 3
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Constants for short_idct4x4llm_ppc, each 16 bytes and 16-byte aligned so
-;# that load_c can fetch them with a single lvx.
-
-;# 35468 in every halfword (exceeds the signed 16-bit range, see the
-;# compensation in the transform passes)
- .align 4
-sinpi8sqrt2:
- .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
-;# 20091 in every halfword
- .align 4
-cospi8sqrt2minus1:
- .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
-;# per-word shift count used with vsraw
- .align 4
-shift_16:
- .long 16, 16, 16, 16
-
-;# vperm selector: bytes 0..7 of the first source, then bytes 0..7 of the second
- .align 4
-hi_hi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
-;# vperm selector: bytes 8..15 of the first source, then bytes 8..15 of the second
- .align 4
-lo_lo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c
deleted file mode 100644
index 71bf6e2..0000000
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-// Signature of the luma edge filters implemented in assembly.
-typedef void loop_filter_function_y_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-// Signature of the chroma edge filters: the U and V planes are passed
-// together and share one pitch.
-typedef void loop_filter_function_uv_ppc
-(
- unsigned char *u, // source pointer
- unsigned char *v, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-// Signature of the simple edge filters, which take only the flimit vector.
-typedef void loop_filter_function_s_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit
-);
-
-// Prototypes for the routines called by the wrappers in this file,
-// declared through the function typedefs above.
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal macroblock-edge filtering (normal filter).
-// Luma is always filtered; the chroma planes are filtered only when u_ptr
-// is non-null (v_ptr is not tested separately).
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride,
-                                        lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (!u_ptr)
-        return;
-
-    mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride,
-                                         lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-// Horizontal macroblock-edge filtering (simple filter): luma only.
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    // The chroma arguments exist only to match the common wrapper signature.
-    (void)uv_stride;
-    (void)v_ptr;
-    (void)u_ptr;
-
-    loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical macroblock-edge filtering (normal filter).
-// Luma is always filtered; the chroma planes are filtered only when u_ptr
-// is non-null (v_ptr is not tested separately).
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride,
-                                      lfi->mbflim, lfi->lim, lfi->thr);
-
-    if (!u_ptr)
-        return;
-
-    mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride,
-                                       lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-// Vertical macroblock-edge filtering (simple filter): luma only.
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    // The chroma arguments exist only to match the common wrapper signature.
-    (void)uv_stride;
-    (void)v_ptr;
-    (void)u_ptr;
-
-    loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal inner-block (B) edge filtering (normal filter).
-// Luma edges 4, 8 and 12 rows down are filtered in that order; chroma gets
-// its single inner edge (4 rows down) only when u_ptr is non-null.
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    int row;
-
-    // These should all be done at once with one call, instead of 3
-    for (row = 4; row <= 12; row += 4)
-    {
-        loop_filter_horizontal_edge_y_ppc(y_ptr + row * y_stride, y_stride,
-                                          lfi->flim, lfi->lim, lfi->thr);
-    }
-
-    if (!u_ptr)
-        return;
-
-    loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride,
-                                       uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-// Horizontal inner-block (B) edge filtering (simple filter): luma only,
-// edges 4, 8 and 12 rows down, in that order.
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    int row;
-
-    // The chroma arguments exist only to match the common wrapper signature.
-    (void)uv_stride;
-    (void)v_ptr;
-    (void)u_ptr;
-
-    for (row = 4; row <= 12; row += 4)
-    {
-        loop_filter_simple_horizontal_edge_ppc(y_ptr + row * y_stride, y_stride, lfi->flim);
-    }
-}
-
-// Vertical inner-block (B) edge filtering (normal filter).
-// Luma needs just one call, made with the unmodified y_ptr. Chroma gets its
-// single inner edge (4 columns in) only when u_ptr is non-null.
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                        int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    loop_filter_vertical_edge_y_ppc(y_ptr, y_stride,
-                                    lfi->flim, lfi->lim, lfi->thr);
-
-    if (!u_ptr)
-        return;
-
-    loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride,
-                                     lfi->flim, lfi->lim, lfi->thr);
-}
-
-// Vertical inner-block (B) edge filtering (simple filter): luma only,
-// edges 4, 8 and 12 columns in, in that order.
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                         int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    int col;
-
-    // The chroma arguments exist only to match the common wrapper signature.
-    (void)uv_stride;
-    (void)v_ptr;
-    (void)u_ptr;
-
-    for (col = 4; col <= 12; col += 4)
-    {
-        loop_filter_simple_vertical_edge_ppc(y_ptr + col, y_stride, lfi->flim);
-    }
-}
diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm
deleted file mode 100644
index 61df4e9..0000000
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl mbloop_filter_horizontal_edge_y_ppc
- .globl loop_filter_horizontal_edge_y_ppc
- .globl mbloop_filter_vertical_edge_y_ppc
- .globl loop_filter_vertical_edge_y_ppc
-
- .globl mbloop_filter_horizontal_edge_uv_ppc
- .globl loop_filter_horizontal_edge_uv_ppc
- .globl mbloop_filter_vertical_edge_uv_ppc
- .globl loop_filter_vertical_edge_uv_ppc
-
- .globl loop_filter_simple_horizontal_edge_ppc
- .globl loop_filter_simple_vertical_edge_ppc
-
- .text
-;# We often need to perform transposes (and other transpose-like operations)
-;# on matrices of data. This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing. They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero. The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits. We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory. It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms. They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
-;#
-;# vmrghb V( x), V( y), V( y + (1<<s))
-;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
-;#
-;# vmrghh V( x), V( y), V( y + (1<<s))
-;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= =1= 2->3 3->(4+d) (4+s)->2:
-;#
-;# vmrghw V( x), V( y), V( y + (1<<s))
-;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# Unfortunately, there is no doubleword merge instruction.
-;# The following sequence uses "vperm" as a substitute.
-;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;# are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
-;#
-;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
-;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;# Except for bits s and d, the other relationships between register
-;# number (= high-order part of address) bits are at the disposal of
-;# the programmer.
-;#
-
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;# edges together. This requires a single 16x16 transpose, which, in
-;# the above language, amounts to the following permutation of address
-;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;# x = 0; do {
-;# vmrghb V(2x), V(x), V(x+8);
-;# vmrglb V(2x+1), V(x), V(x+8);
-;# } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-;# Tpair: byte-interleaves vectors \X and \Y.
-;# \A receives the merge of their high halves, \B the merge of their low halves.
-.macro Tpair A, B, X, Y
- vmrghb \A, \X, \Y
- vmrglb \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-;# t16_even: one merge step of the 16x16 transpose.
-;# Reads bank v0..v15 (pairing Vx with Vx+8) and writes bank v16..v31.
-.macro t16_even
- Tpair v16,v17, v0,v8
- Tpair v18,v19, v1,v9
- Tpair v20,v21, v2,v10
- Tpair v22,v23, v3,v11
- Tpair v24,v25, v4,v12
- Tpair v26,v27, v5,v13
- Tpair v28,v29, v6,v14
- Tpair v30,v31, v7,v15
-.endm
-
-;# t16_odd: one merge step of the 16x16 transpose.
-;# Reads bank v16..v31 (pairing Vx with Vx+8) and writes bank v0..v15.
-.macro t16_odd
- Tpair v0,v1, v16,v24
- Tpair v2,v3, v17,v25
- Tpair v4,v5, v18,v26
- Tpair v6,v7, v19,v27
- Tpair v8,v9, v20,v28
- Tpair v10,v11, v21,v29
- Tpair v12,v13, v22,v30
- Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-;# t16_full: complete 16x16 byte transpose in four merge steps.
-;# The first step reads v16..v31 and the last step writes v16..v31, so both
-;# input and output live in v16..v31; v0..v15 are overwritten as scratch.
-.macro t16_full
- t16_odd
- t16_even
- t16_odd
- t16_even
-.endm
-
-;# Vertical edge filtering requires transposes. For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;# v0 = 0 1 ... 14 15
-;# v1 = 16 17 ... 30 31
-;# v2 = 32 33 ... 46 47
-;# v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;# 0 16 32 48
-;# 1 17 33 49
-;# ...
-;# 15 31 47 63.
-;#
-;# We begin by reading the data 32 bits at a time (using scalar operations)
-;# into a temporary array, reading the rows of the array into vector registers,
-;# with the following layout:
-;#
-;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
-;# v1 = 1 17 33 49 5 21 ... 45 61
-;# v2 = 2 18 ... 46 62
-;# v3 = 3 19 ... 47 63
-;#
-;# From the "address-bit" perspective discussed above, we simply need to
-;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;# In other words, we transpose each of the four 4x4 submatrices.
-;#
-;# This transformation is its own inverse, and we need to perform it
-;# again before writing the pixels back into the frame buffer.
-;#
-;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;# defined above. We think of both groups of 4 registers as having
-;# "addresses" {0,1,2,3} * 16.
-;#
-;# Transpose4times4x4: in-place transform of v0..v3 using byte, halfword and
-;# word merges followed by a vperm step; v4..v7 are scratch.
-;# Vlo, Vhi vector registers holding the vperm selectors for the last step
-;# NOTE(review): the comment preceding this macro names v14/v15 as the
-;# selector registers; here they arrive as arguments -- presumably callers
-;# pass those registers, confirm at the call sites.
-.macro Transpose4times4x4 Vlo, Vhi
-
- ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
-
- vmrghb v4, v0, v1
- vmrglb v5, v0, v1
- vmrghb v6, v2, v3
- vmrglb v7, v2, v3
-
- ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
-
- vmrghh v0, v4, v6
- vmrglh v1, v4, v6
- vmrghh v2, v5, v7
- vmrglh v3, v5, v7
-
- ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
-
- vmrghw v4, v0, v1
- vmrglw v5, v0, v1
- vmrghw v6, v2, v3
- vmrglw v7, v2, v3
-
- ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
-
- vperm v0, v4, v6, \Vlo
- vperm v1, v4, v6, \Vhi
- vperm v2, v5, v7, \Vlo
- vperm v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;# We read 8 columns of data, initially in the following pattern:
-;#
-;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
-;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
-;# ...
-;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;# and wish to convert to:
-;#
-;# (0,0) ... (0,15)
-;# (1,0) ... (1,15)
-;# ...
-;# (7,0) ... (7,15).
-;#
-;# In "address bit" language, we wish to map
-;#
-;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
-;#
-;# This can be accomplished by 4 iterations of the cyclic transform
-;#
-;# I -> (I+1) mod 7;
-;#
-;# each iteration can be realized by (d=0, s=2):
-;#
-;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
-;#
-;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
-;# preserving v8 = sign converter.
-;#
-;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;# result lands in the "mirror" registers v10...v17
-;#
-;# t8x16_odd: one merge step of the 8x16 transpose.
-;# Reads v0..v7 (pairing Vx with Vx+4) and writes the mirror bank v10..v17.
-.macro t8x16_odd
- Tpair v10, v11, v0, v4
- Tpair v12, v13, v1, v5
- Tpair v14, v15, v2, v6
- Tpair v16, v17, v3, v7
-.endm
-
-;# t8x16_even: one merge step of the 8x16 transpose.
-;# Reads the mirror bank v10..v17 (pairing Vx with Vx+4) and writes v0..v7.
-.macro t8x16_even
- Tpair v0, v1, v10, v14
- Tpair v2, v3, v11, v15
- Tpair v4, v5, v12, v16
- Tpair v6, v7, v13, v17
-.endm
-
-;# transpose8x16_fwd: forward transpose in four merge steps.
-;# Input and result are both in v0..v7; v10..v17 are overwritten as scratch.
-.macro transpose8x16_fwd
- t8x16_odd
- t8x16_even
- t8x16_odd
- t8x16_even
-.endm
-
-;# transpose8x16_inv: inverse transpose in three merge steps.
-;# Input is in v0..v7; because the last step is t8x16_odd, the result is
-;# left in the mirror registers v10..v17.
-.macro transpose8x16_inv
- t8x16_odd
- t8x16_even
- t8x16_odd
-.endm
-
-;# Transpose16x16: 16x16 byte transpose of v16..v31, leaving the result in
-;# v16..v31 and using v0..v15 as scratch.
-;# Expressed with the merge-step macros defined earlier in this file; the
-;# four steps expand to the same 64 vmrghb/vmrglb instructions, in the same
-;# order, as the fully unrolled form.
-.macro Transpose16x16
- t16_odd ;# v16..v31 -> v0..v15
- t16_even ;# v0..v15 -> v16..v31
- t16_odd ;# v16..v31 -> v0..v15
- t16_even ;# v0..v15 -> v16..v31
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;# into vector register Vreg. Trashes r0
-;# Vreg destination vector register
-;# Gptr memory operand from which the vector's address is read with lwz
-.macro load_g Vreg, Gptr
- lwz r0, \Gptr ;# r0 = address of the vector
- lvx \Vreg, 0, r0
-.endm
-
-;# Absolute difference of unsigned bytes.
-;# Exploits saturation: a negative difference is clamped to 0, so at most
-;# one of (A-B) and (B-A) is non-zero, and OR-ing them gives the absolute value.
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
- vsububs \RES, \A, \B ;# max(A-B, 0)
- vsububs \TMP, \B, \A ;# max(B-A, 0)
- vor \RES, \RES, \TMP
-.endm
-
-;# Folds another absolute difference into a running per-byte maximum.
-;# Both saturating differences are max-ed in separately, which equals
-;# taking the maximum with abs(A-B).
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
- vsububs \TMP, \A, \B ;# max(A-B, 0)
- vmaxub \RES, \RES, \TMP
- vsububs \TMP, \B, \A ;# max(B-A, 0)
- vmaxub \RES, \RES, \TMP
-.endm
-
-;# Masks: builds the per-pixel loop-filter masks.
-;# In: v0..v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3,
-;# v8 = flimit, v9 = limit, v10 = thresh
-;# Out: v10 = hev (all ones where the threshold is exceeded),
-;# v8 = all ones where flimit or limit is exceeded
-;# Trashes v9, v13, v14.
-.macro Masks
- ;# build masks
- ;# input is all 8 bit unsigned (0-255). need to
- ;# do abs(vala-valb) > limit. but no need to compare each
- ;# value to the limit. find the max of the absolute differences
- ;# and compare that to the limit.
- ;# First hev
- Abs v14, v13, v2, v3 ;# |P1 - P0|
- max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
- vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
- ;# Next limit
- ;# (v14 still holds the two differences above, so they are limit-checked too)
- max_abs v14, v13, v0, v1 ;# |P3 - P2|
- max_abs v14, v13, v1, v2 ;# |P2 - P1|
- max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
- max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
- vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
- ;# flimit
- Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
- vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
- vor v8, v8, v9 ;# R = true if flimit or limit exceeded
- ;# done building masks
-.endm
-
-;# build_constants: loads the three filter parameter vectors and builds the
-;# sign-flip constant.
-;# RFL, RLI, RTH GPRs holding the addresses of flimit, limit and thresh
-;# FL, LI, TH vector registers that receive them
-;# Also sets v11 to 0x80 in every byte and leaves 4 in every byte of v12.
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
- ;# build constants
- lvx \FL, 0, \RFL ;# flimit
- lvx \LI, 0, \RLI ;# limit
- lvx \TH, 0, \RTH ;# thresh
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 8 << 4: 0x80808080808080808080808080808080
-.endm
-
-;# load_data_y: loads the eight 16-byte rows straddling a horizontal edge.
-;# In: r3 = pointer to the first row below the edge (Q0), r4 = stride
-;# Out: v0..v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3
-;# r5 = 2 * stride, r6 = r3 - 2 * stride, r7 = -stride
-;# Overwrites r5, r6 and r7, and trashes r0.
-.macro load_data_y
- ;# setup strides/pointers to be able to access
- ;# all of the data
- add r5, r4, r4 ;# r5 = 2 * stride
- sub r6, r3, r5 ;# r6 -> 2 rows back
- neg r7, r4 ;# r7 = -stride
-
- ;# load 16 pixels worth of data to work on
- sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
- lvx v0, 0, r0 ;# P3 (read only)
- lvx v1, r7, r6 ;# P2
- lvx v2, 0, r6 ;# P1
- lvx v3, r7, r3 ;# P0
- lvx v4, 0, r3 ;# Q0
- lvx v5, r4, r3 ;# Q1
- lvx v6, r5, r3 ;# Q2
- add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
- lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# common_adjust: adjusts P0 and Q0 across the edge by the filter value.
-;# P0, Q0, P1, Q1 vector registers holding those pixel rows; all four are
-;# converted to signed form (xor with v11) and left that way
-;# HEV_PRESENT non-zero: the outer-tap term (P1 - Q1) is masked with hev
-;# Expects
-;# v8 == mask (true where filtering must be suppressed)
-;# v10 == HEV
-;# v11 == 0x80 in every byte
-;# v13 == tmp
-;# v14 == tmp
-;# On exit v13 holds f1 >> 3; v8, v9, v14 and v15 are overwritten.
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
- vxor \P1, \P1, v11 ;# SP1
- vxor \P0, \P0, v11 ;# SP0
- vxor \Q0, \Q0, v11 ;# SQ0
- vxor \Q1, \Q1, v11 ;# SQ1
-
- vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
- vand v13, v13, v10 ;# f &= hev
-.endif
- vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
- vandc v13, v13, v8 ;# f &= mask
-
- vspltisb v8, 3 ;# mask is no longer needed: v8 = 3
- vspltisb v9, 4
-
- vaddsbs v14, v13, v9 ;# f1 = c (f+4)
- vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
- vsrab v13, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
- vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
-
-;# vp8_mbfilter: macroblock-edge filter applied to the rows in v0..v7.
-;# In: v0..v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3, v8/v9/v10 = flimit/limit/thresh,
-;# v11 = 0x80 in every byte
-;# Out: v1..v6 (P2..Q2) filtered and converted back to unsigned form
-;# Trashes v8, v9, v10 and v12..v15.
-.macro vp8_mbfilter
- Masks
-
- ;# start the filtering here
- vxor v1, v1, v11 ;# SP2
- vxor v2, v2, v11 ;# SP1
- vxor v3, v3, v11 ;# SP0
- vxor v4, v4, v11 ;# SQ0
- vxor v5, v5, v11 ;# SQ1
- vxor v6, v6, v11 ;# SQ2
-
- ;# add outer taps if we have high edge variance
- vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
- vsubsbs v14, v4, v3 ;# SQ0-SP0
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
- vandc v13, v13, v8 ;# f &= mask
- vand v15, v13, v10 ;# f2 = f & hev
-
- ;# save bottom 3 bits so that we round one side +4 and the other +3
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v15, v9 ;# f1 = c (f+4)
- vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
- vsrab v14, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
- vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
- ;# only apply wider filter if not high edge variance
- vandc v13, v13, v10 ;# f &= ~hev
-
- ;# build the rounding constant 63 as halfwords, and the multiplier 9
- vspltisb v9, 2
- vnor v8, v8, v8 ;# v8 = ~3 = 0xfc in every byte
- vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
- vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
- vspltisb v8, 9
-
- ;# roughly 1/7th difference across boundary
- vspltish v10, 7 ;# hev is no longer needed: v10 = shift count
- vmulosb v14, v8, v13 ;# 9 * f, odd bytes
- vmulesb v15, v8, v13 ;# 9 * f, even bytes
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14 ;# re-interleave even/odd results
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v6, v6, v10 ;# subtract from Q and add to P
- vaddsbs v1, v1, v10
-
- vxor v6, v6, v11 ;# Q2 back to unsigned
- vxor v1, v1, v11 ;# P2 back to unsigned
-
- ;# roughly 2/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v8, v8 ;# multiplier 18
- vmulosb v14, v12, v13 ;# 18 * f, odd bytes
- vmulesb v15, v12, v13 ;# 18 * f, even bytes
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v5, v5, v10 ;# subtract from Q and add to P
- vaddsbs v2, v2, v10
-
- vxor v5, v5, v11 ;# Q1 back to unsigned
- vxor v2, v2, v11 ;# P1 back to unsigned
-
- ;# roughly 3/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v12, v8 ;# multiplier 27
- vmulosb v14, v12, v13 ;# 27 * f, odd bytes
- vmulesb v15, v12, v13 ;# 27 * f, even bytes
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v4, v4, v10 ;# subtract from Q and add to P
- vaddsbs v3, v3, v10
-
- vxor v4, v4, v11 ;# Q0 back to unsigned
- vxor v3, v3, v11 ;# P0 back to unsigned
-.endm
-
-;# SBFilter: inner (sub-block) edge filter applied to the rows in v0..v7.
-;# In: v0..v7 = P3 P2 P1 P0 Q0 Q1 Q2 Q3, v8/v9/v10 = flimit/limit/thresh,
-;# v11 = 0x80 in every byte
-;# Out: v2..v5 (P1 P0 Q0 Q1) filtered and converted back to unsigned form
-.macro SBFilter
- Masks
-
- ;# adjusts P0/Q0 and leaves f1 >> 3 in v13 for the outer taps below
- common_adjust v3, v4, v2, v5, 1
-
- ;# outer tap adjustments
- vspltisb v8, 1
-
- vaddubm v13, v13, v8 ;# f += 1
- vsrab v13, v13, v8 ;# f >>= 1
-
- vandc v13, v13, v10 ;# f &= ~hev
-
- vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
- vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
-
- ;# convert the four modified rows back to unsigned form
- vxor v2, v2, v11
- vxor v3, v3, v11
- vxor v4, v4, v11
- vxor v5, v5, v11
-.endm
-
- .align 2
-mbloop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- vp8_mbfilter
-
- stvx v1, r7, r6 ;# P2
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
- stvx v6, r5, r3 ;# Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- SBFilter
-
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
-;# So we can read in an entire mb aligned. However if we want to filter the mb
-;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
-;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit
-;# of a waste. So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;# memory to align and order them up. Then they are read in using the
-;# vector register file.
-.macro RLVmb V, R
- lwzux r0, r3, r4
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r3, r4
- stw r0,12(\R)
- lwz r0,-4(r3)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-.macro WLVmb V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r3, r4
- lwz r0, 8(\R)
- stw r0,-4(r3)
- lwz r0, 4(\R)
- stwux r0, r3, r4
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
- sub r3, r3, r4
-
- RLVmb v0, r9
- RLVmb v1, r9
- RLVmb v2, r9
- RLVmb v3, r9
- RLVmb v4, r9
- RLVmb v5, r9
- RLVmb v6, r9
- RLVmb v7, r9
-
- transpose8x16_fwd
-
- build_constants r5, r6, r7, v8, v9, v10
-
- vp8_mbfilter
-
- transpose8x16_inv
-
- add r3, r3, r4
- neg r4, r4
-
- WLVmb v17, r9
- WLVmb v16, r9
- WLVmb v15, r9
- WLVmb v14, r9
- WLVmb v13, r9
- WLVmb v12, r9
- WLVmb v11, r9
- WLVmb v10, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RL V, R, P
- lvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro WL V, R, P
- stvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
- ;# K = |P0-P1| already
- Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
- vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
- vcmpgtub v10, v14, v0
-
- Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1]
-
- max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
- max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
- max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
-
- vmaxub v14, v14, v4 ;# M = max interior abs diff
- vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
-
- Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
- vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
- vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
-
- ;# replace P1,Q1 w/signed versions
- common_adjust \P0, \Q0, \P1, \Q1, 1
-
- vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
- vsrab v13, v13, v1
- vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
- vsubsbs \Q1, \Q1, v13
- vaddsbs \P1, \P1, v13
-
- vxor \P1, \P1, v11 ;# P1
- vxor \P0, \P0, v11 ;# P0
- vxor \Q0, \Q0, v11 ;# Q0
- vxor \Q1, \Q1, v11 ;# Q1
-.endm
-
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- addi r9, r3, 0
- RL v16, r9, r4
- RL v17, r9, r4
- RL v18, r9, r4
- RL v19, r9, r4
- RL v20, r9, r4
- RL v21, r9, r4
- RL v22, r9, r4
- RL v23, r9, r4
- RL v24, r9, r4
- RL v25, r9, r4
- RL v26, r9, r4
- RL v27, r9, r4
- RL v28, r9, r4
- RL v29, r9, r4
- RL v30, r9, r4
- lvx v31, 0, r9
-
- Transpose16x16
-
- vspltisb v1, 1
-
- build_constants r5, r6, r7, v3, v2, v0
-
- Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
-
- Fil v16, v17, v18, v19, v20, v21, v22, v23
- Fil v20, v21, v22, v23, v24, v25, v26, v27
- Fil v24, v25, v26, v27, v28, v29, v30, v31
-
- Transpose16x16
-
- addi r9, r3, 0
- WL v16, r9, r4
- WL v17, r9, r4
- WL v18, r9, r4
- WL v19, r9, r4
- WL v20, r9, r4
- WL v21, r9, r4
- WL v22, r9, r4
- WL v23, r9, r4
- WL v24, r9, r4
- WL v25, r9, r4
- WL v26, r9, r4
- WL v27, r9, r4
- WL v28, r9, r4
- WL v29, r9, r4
- WL v30, r9, r4
- stvx v31, 0, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
- andi. r7, r3, 8 ;# row origin modulo 16
- add r7, r7, r7 ;# selects selectors
- lis r12, _chromaSelectors@ha
- la r0, _chromaSelectors@l(r12)
- lwzux r0, r7, r0 ;# leave selector addr in r7
-
- lvx \V, 0, r0 ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
- lvx \U, \Offs, r3
- lvx \V, \Offs, r4
- vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
- vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
- vperm \V, \New, \V, \Vmask
- stvx \U, \Offs, r3 ;# Write to frame buffer
- stvx \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
- neg r9, r5 ;# r9 = -1 * stride
- add r8, r9, r9 ;# r8 = -2 * stride
- add r10, r5, r5 ;# r10 = 2 * stride
-
- active_chroma_sel v12
-
- ;# P3, Q3 are read-only; need not save addresses or sibling pels
- add r6, r8, r8 ;# r6 = -4 * stride
- hread_uv v0, v14, v15, r6, v12
- add r6, r10, r5 ;# r6 = 3 * stride
- hread_uv v7, v14, v15, r6, v12
-
- ;# Others are read/write; save addresses and sibling pels
-
- add r6, r8, r9 ;# r6 = -3 * stride
- hread_uv v1, v16, v17, r6, v12
- hread_uv v2, v18, v19, r8, v12
- hread_uv v3, v20, v21, r9, v12
- hread_uv v4, v22, v23, 0, v12
- hread_uv v5, v24, v25, r5, v12
- hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
- load_g \V, 4(r7)
-.endm
-
-.macro vresult_sel V
- load_g \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
- uresult_sel v11
- vresult_sel v12
- hwrite_uv v2, v18, v19, r8, v11, v12
- hwrite_uv v3, v20, v21, r9, v11, v12
- hwrite_uv v4, v22, v23, 0, v11, v12
- hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- vp8_mbfilter
-
- store_chroma_h
-
- hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
- hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- SBFilter
-
- store_chroma_h
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro R V, R
- lwzux r0, r3, r5
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r4, r5
- stw r0,12(\R)
- lwz r0,-4(r4)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-
-.macro W V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r4, r5
- lwz r0, 8(\R)
- stw r0,-4(r4)
- lwz r0, 4(\R)
- stwux r0, r3, r5
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
-.macro chroma_vread R
- sub r3, r3, r5 ;# back up one line for simplicity
- sub r4, r4, r5
-
- R v0, \R
- R v1, \R
- R v2, \R
- R v3, \R
- R v4, \R
- R v5, \R
- R v6, \R
- R v7, \R
-
- transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
- transpose8x16_inv
-
- add r3, r3, r5
- add r4, r4, r5
- neg r5, r5 ;# Write rows back in reverse order
-
- W v17, \R
- W v16, \R
- W v15, \R
- W v14, \R
- W v13, \R
- W v12, \R
- W v11, \R
- W v10, \R
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- vp8_mbfilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- SBFilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
- Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
- vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit
-
- ;# preserve unsigned v0 and v3
- common_adjust v1, v2, v0, v3, 0
-
- vxor v1, v1, v11
- vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
- addi r8, 0, 16
- addi r7, r5, 32
-
- lvx v0, 0, r5
- lvx v1, r8, r5
- lvx v2, 0, r7
- lvx v3, r8, r7
-
- lis r12, _B_hihi@ha
- la r0, _B_hihi@l(r12)
- lvx v16, 0, r0
-
- lis r12, _B_lolo@ha
- la r0, _B_lolo@l(r12)
- lvx v17, 0, r0
-
- Transpose4times4x4 v16, v17
- vp8_simple_filter
-
- vxor v0, v0, v11
- vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
-
- Transpose4times4x4 v16, v17
-
- stvx v0, 0, r5
- stvx v1, r8, r5
- stvx v2, 0, r7
- stvx v3, r8, r7
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- neg r5, r4 ;# r5 = -1 * stride
- add r6, r5, r5 ;# r6 = -2 * stride
-
- lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
- lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
- lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
- lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
-
- vp8_simple_filter
-
- stvx v1, r5, r3 ;# store P0
- stvx v2, 0, r3 ;# store Q0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RLV Offs
- stw r0, (\Offs*4)(r5)
- lwzux r0, r7, r4
-.endm
-
-.macro WLV Offs
- lwz r0, (\Offs*4)(r5)
- stwux r0, r7, r4
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- la r5, -96(r1) ;# temporary space for reading in vectors
-
- ;# Store 4 pels at word "Offs" in temp array, then advance r7
- ;# to next row and read another 4 pels from the frame buffer.
-
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r7 ;# read first 4 pels
-
- ;# 16 unaligned word accesses
- RLV 0
- RLV 4
- RLV 8
- RLV 12
- RLV 1
- RLV 5
- RLV 9
- RLV 13
- RLV 2
- RLV 6
- RLV 10
- RLV 14
- RLV 3
- RLV 7
- RLV 11
-
- stw r0, (15*4)(r5) ;# write last 4 pels
-
- simple_vertical
-
- ;# Read temp array, write frame buffer.
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r5 ;# read/write first 4 pels
- stwx r0, 0, r7
-
- WLV 4
- WLV 8
- WLV 12
- WLV 1
- WLV 5
- WLV 9
- WLV 13
- WLV 2
- WLV 6
- WLV 10
- WLV 14
- WLV 3
- WLV 7
- WLV 11
- WLV 15
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-_chromaSelectors:
- .long _B_hihi
- .long _B_Ures0
- .long _B_Vres0
- .long 0
- .long _B_lolo
- .long _B_Ures8
- .long _B_Vres8
- .long 0
-
- .align 4
-_B_Vres8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
- .align 4
-_B_Ures8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
- .align 4
-_B_lolo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_Vres0:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
- .align 4
-_B_Ures0:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_hihi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm
deleted file mode 100644
index f81d86f..0000000
--- a/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl save_platform_context
- .globl restore_platform_context
-
-.macro W V P
- stvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-.macro R V P
- lvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-;# r3 context_ptr
- .align 2
-save_platform_contex:
- W v20, r3
- W v21, r3
- W v22, r3
- W v23, r3
- W v24, r3
- W v25, r3
- W v26, r3
- W v27, r3
- W v28, r3
- W v29, r3
- W v30, r3
- W v31, r3
-
- blr
-
-;# r3 context_ptr
- .align 2
-restore_platform_context:
- R v20, r3
- R v21, r3
- R v22, r3
- R v23, r3
- R v24, r3
- R v25, r3
- R v26, r3
- R v27, r3
- R v28, r3
- R v29, r3
- R v30, r3
- R v31, r3
-
- blr
diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm
deleted file mode 100644
index dd39e05..0000000
--- a/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl recon4b_ppc
- .globl recon2b_ppc
- .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- addi \Pred, \Pred, 16 ;# next pred
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- addi \Diff, \Diff, 32 ;# next diff
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, \Dst ;# to dst
- add \Dst, \Dst, \Stride ;# next dst
-.endm
-
- .text
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon4b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v2 = d8..d15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 2 rows to dst from buf
- lwz r0, 0(r10)
-.if \write_first_four_pels
- stw r0, 0(\Dst)
- .else
- stwux r0, \Dst, \Stride
-.endif
- lwz r0, 4(r10)
- stw r0, 4(\Dst)
- lwz r0, 8(r10)
- stwux r0, \Dst, \Stride ;# advance dst to next row
- lwz r0, 12(r10)
- stw r0, 4(\Dst)
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-
-recon2b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- la r10, -48(r1) ;# buf
-
- two_rows_of8 r3, r4, r5, r6, 1
-
- addi r4, r4, 16; ;# next pred
- addi r3, r3, 32; ;# next diff
-
- two_rows_of8 r3, r4, r5, r6, 0
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro get_two_diff_rows
- stw r0, 0(r10)
- lwz r0, 4(r3)
- stw r0, 4(r10)
- lwzu r0, 32(r3)
- stw r0, 8(r10)
- lwz r0, 4(r3)
- stw r0, 12(r10)
- lvx v3, 0, r10
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon_b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
-
- la r10, -48(r1) ;# buf
-
- lwz r0, 0(r4)
- stw r0, 0(r10)
- lwz r0, 16(r4)
- stw r0, 4(r10)
- lwz r0, 32(r4)
- stw r0, 8(r10)
- lwz r0, 48(r4)
- stw r0, 12(r10)
-
- lvx v1, 0, r10; ;# v1 = pred = p0..p15
-
- lwz r0, 0(r3) ;# v3 = d0..d7
-
- get_two_diff_rows
-
- vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
- vaddshs v2, v2, v3; ;# v2 = r0..r7
-
- lwzu r0, 32(r3) ;# v3 = d8..d15
-
- get_two_diff_rows
-
- vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
- vaddshs v3, v3, v1; ;# v3 = r8..r15
-
- vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10; ;# 16 pels to dst from buf
-
- lwz r0, 0(r10)
- stw r0, 0(r5)
- lwz r0, 4(r10)
- stwux r0, r5, r6
- lwz r0, 8(r10)
- stwux r0, r5, r6
- lwz r0, 12(r10)
- stwx r0, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm
deleted file mode 100644
index e5f2638..0000000
--- a/vp8/common/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sad16x16_ppc
- .globl vp8_sad16x8_ppc
- .globl vp8_sad8x16_ppc
- .globl vp8_sad8x8_ppc
- .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v8, 0 ;# zero out total to start
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v8, v6, v8
-.endm
-
-.macro sad_16_loop loop_label
- lvsl v3, 0, r5 ;# only needs to be done once per block
-
- ;# preload a line of data before getting into the loop
- lvx v4, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- add r5, r5, r6
- add r3, r3, r4
-
- vperm v5, v1, v2, v3
-
- .align 4
-\loop_label:
- ;# compute difference on first row
- vsububs v6, v4, v5
- vsububs v7, v5, v4
-
- ;# load up next set of data
- lvx v9, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- ;# perform abs() of difference
- vor v6, v6, v7
- add r3, r3, r4
-
- ;# add to the running tally
- vsum4ubs v8, v6, v8
-
- ;# now onto the next line
- vperm v5, v1, v2, v3
- add r5, r5, r6
- lvx v4, 0, r3
-
- ;# compute difference on second row
- vsububs v6, v9, v5
- lvx v1, 0, r5
- vsububs v7, v5, v9
- lvx v2, r10, r5
- vor v6, v6, v7
- add r3, r3, r4
- vsum4ubs v8, v6, v8
- vperm v5, v1, v2, v3
- add r5, r5, r6
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
- .align 4
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v7, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v7
-
- SAD_16
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_16_loop sad16x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_16_loop sad16x8_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_8_loop sad8x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_8_loop sad8x8_loop
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r7, 0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r7, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- vspltisw v8, 0 ;# zero out total to start
-
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v7, v6, v8
- vsumsws v7, v7, v8
-
- stvx v7, 0, r1
- lwz r3, 12(r1)
-
- epilogue
-
- blr
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
deleted file mode 100644
index 6899c0e..0000000
--- a/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_post_proc_down_and_across_mb_row)(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-
-extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp8_post_proc_down_and_across_mb_row_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp8_copy_mem16x16;
-extern copy_mem_block_function *vp8_copy_mem8x8;
-extern copy_mem_block_function *vp8_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp8_sixtap_predict_c;
-extern subpixel_predict_function vp8_sixtap_predict8x4_c;
-extern subpixel_predict_function vp8_sixtap_predict8x8_c;
-extern subpixel_predict_function vp8_sixtap_predict16x16_c;
-extern subpixel_predict_function vp8_bilinear_predict4x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x8_c;
-extern subpixel_predict_function vp8_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp8_copy_mem16x16_c;
-extern copy_mem_block_function vp8_copy_mem8x8_c;
-extern copy_mem_block_function vp8_copy_mem8x4_c;
-
-void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp8_loop_filter_mbv_c;
-extern loop_filter_block_function vp8_loop_filter_bv_c;
-extern loop_filter_block_function vp8_loop_filter_mbh_c;
-extern loop_filter_block_function vp8_loop_filter_bh_c;
-
-extern loop_filter_block_function vp8_loop_filter_mbvs_c;
-extern loop_filter_block_function vp8_loop_filter_bvs_c;
-extern loop_filter_block_function vp8_loop_filter_mbhs_c;
-extern loop_filter_block_function vp8_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp8_clear_c(void)
-{
-}
-
-void vp8_machine_specific_config(void)
-{
- // Pure C:
- vp8_clear_system_state = vp8_clear_c;
- vp8_recon_b = vp8_recon_b_c;
- vp8_recon4b = vp8_recon4b_c;
- vp8_recon2b = vp8_recon2b_c;
-
- vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
- vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
- vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
- vp8_bilinear_predict = bilinear_predict4x4_ppc;
-
- vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
- vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
- vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
- vp8_sixtap_predict = sixtap_predict_ppc;
-
- vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
- vp8_short_idct4x4 = short_idct4x4llm_ppc;
- vp8_dc_only_idct = vp8_dc_only_idct_c;
-
- vp8_lf_mbvfull = loop_filter_mbv_ppc;
- vp8_lf_bvfull = loop_filter_bv_ppc;
- vp8_lf_mbhfull = loop_filter_mbh_ppc;
- vp8_lf_bhfull = loop_filter_bh_ppc;
-
- vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
- vp8_lf_bvsimple = loop_filter_bvs_ppc;
- vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
- vp8_lf_bhsimple = loop_filter_bhs_ppc;
-
- vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
- vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
- vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
- vp8_plane_add_noise = vp8_plane_add_noise_c;
-
- vp8_copy_mem16x16 = copy_mem16x16_ppc;
- vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
- vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-
-}
diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm
deleted file mode 100644
index fb8d5bb..0000000
--- a/vp8/common/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_get8x8var_ppc
- .globl vp8_get16x16var_ppc
- .globl vp8_mse16x16_ppc
- .globl vp8_variance16x16_ppc
- .globl vp8_variance16x8_ppc
- .globl vp8_variance8x16_ppc
- .globl vp8_variance8x8_ppc
- .globl vp8_variance4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v7, 0 ;# zero for merging
- vspltisw v8, 0 ;# zero out total to start
- vspltisw v9, 0 ;# zero out total for dif^2
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
- ;# Compute sum first. Unpack to so signed subract
- ;# can be used. Only have a half word signed
- ;# subract. Do high, then low.
- vmrghb v2, v7, v4
- vmrghb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- vmrglb v2, v7, v4
- vmrglb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v0, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v0
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> 8
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, get8x8var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, get16x16var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r 3 return value
-vp8_mse16x16_ppc:
- prologue
-
- mtctr r10
-
-mse16x16_loop:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-
- bdnz mse16x16_loop
-
- vsumsws v9, v9, v7
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stw r3, 0(r7) ;# sse
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x16_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, variance16x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x8_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_16 7, variance16x8_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_8 7, variance8x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, variance8x8_loop, 0
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r10,0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r10, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- compute_sum_sse
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, 4 ;# (sum*sum) >> 4
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
- epilogue
-
- blr
diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm
deleted file mode 100644
index 2308373..0000000
--- a/vp8/common/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sub_pixel_variance4x4_ppc
- .globl vp8_sub_pixel_variance8x8_ppc
- .globl vp8_sub_pixel_variance8x16_ppc
- .globl vp8_sub_pixel_variance16x8_ppc
- .globl vp8_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r12, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r12, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r12, r0
-
- ;# index to the next set of vectors in the row.
- li r12, 32
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm intput
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456
- vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v16 v17 = evens, odds
- vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
- ;# Compute sum first. Unpack to so signed subract
- ;# can be used. Only have a half word signed
- ;# subract. Do high, then low.
- vmrghb \t1, \z0, \src
- vmrghb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- vmrglb \t1, \z0, \src
- vmrglb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- ;# Now compute sse.
- vsububs \t1, \src, \ref
- vsububs \t2, \ref, \src
- vor \t1, \t1, \t2
-
- vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
- vsumsws \sum, \sum, \z0
- vsumsws \sse, \sse, \z0
-
- stvx \sum, 0, r1
- lwz r3, 12(r1)
-
- stvx \sse, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r9) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> 8
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 8)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
- load_and_align_16 v16, r7, r8, \increment_counter
- compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
- lvsl v17, 0, \R ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v21, 0, \R
- lvx v22, r10, \R
-
-.if \increment_counter
- add \R, \R, \P
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_4x4_b
-
- hfilter_8 v4, v10, v11, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
-
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_c v10, b_hilo_b, 0, r12, r0
-
- vperm v0, v0, v1, v10
- vperm v1, v2, v3, v10
-
- compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 4
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
- hfilter_8 v4, v10, v11, 1
- hfilter_8 v5, v10, v11, 1
- hfilter_8 v6, v10, v11, 1
- hfilter_8 v7, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x8_b
-
- hfilter_8 v8, v10, v11, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 0
-
- beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 0
-
- vmrghb v4, v4, v5
- vmrghb v5, v6, v7
- vmrghb v6, v8, v9
- vmrghb v7, v10, v11
-
- compute_sum_sse v0, v4, v18, v19, v20, v21, v23
- compute_sum_sse v1, v5, v18, v19, v20, v21, v23
- compute_sum_sse v2, v6, v18, v19, v20, v21, v23
- compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 6
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x16_pre_copy_b
-
- ;# Load up permutation constants
- load_c v29, b_0123_b, 0, r12, r0
- load_c v30, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v29, v30, 1
- hfilter_8 v1, v29, v30, 1
- hfilter_8 v2, v29, v30, 1
- hfilter_8 v3, v29, v30, 1
- hfilter_8 v4, v29, v30, 1
- hfilter_8 v5, v29, v30, 1
- hfilter_8 v6, v29, v30, 1
- hfilter_8 v7, v29, v30, 1
- hfilter_8 v8, v29, v30, 1
- hfilter_8 v9, v29, v30, 1
- hfilter_8 v10, v29, v30, 1
- hfilter_8 v11, v29, v30, 1
- hfilter_8 v12, v29, v30, 1
- hfilter_8 v13, v29, v30, 1
- hfilter_8 v14, v29, v30, 1
- hfilter_8 v15, v29, v30, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x16_b
-
- hfilter_8 v16, v29, v30, 0
-
- b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
- vmrghb v4, v8, v9
- vmrghb v5, v10, v11
- vmrghb v6, v12, v13
- vmrghb v7, v14, v15
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 1
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v0, v8, v18, v19, v20, v21, v23
- compute_sum_sse v1, v9, v18, v19, v20, v21, v23
- compute_sum_sse v2, v10, v18, v19, v20, v21, v23
- compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 0
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v4, v8, v18, v19, v20, v21, v23
- compute_sum_sse v5, v9, v18, v19, v20, v21, v23
- compute_sum_sse v6, v10, v18, v19, v20, v21, v23
- compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm intput
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x8_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x8_b
-
- hfilter_16 v8, 0
-
- b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
-
- beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 0
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 1
- compute_sum_sse_16 v8, 1
- compute_sum_sse_16 v9, 1
- compute_sum_sse_16 v10, 1
- compute_sum_sse_16 v11, 1
- compute_sum_sse_16 v12, 1
- compute_sum_sse_16 v13, 1
- compute_sum_sse_16 v14, 1
- compute_sum_sse_16 v15, 0
-
- variance_final v18, v19, v23, 8
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index ab03c90..4f404bd6 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -1069,7 +1069,6 @@
pc->vert_scale = clear[6] >> 6;
}
data += 7;
- clear += 7;
}
else
{
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 85767ef..84b9fae 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -2969,7 +2969,6 @@
*/
decay_accumulator = 1.0;
boost_score = 0.0;
- loop_decay_rate = 1.00; /* Starting decay rate */
for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
{
@@ -3213,7 +3212,7 @@
int new_width = cpi->oxcf.Width;
int new_height = cpi->oxcf.Height;
- int projected_buffer_level = (int)cpi->buffer_level;
+ int projected_buffer_level;
int tmp_q;
double projected_bits_perframe;
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index f0c8f28..a55a1ea 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -142,7 +142,7 @@
int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
- int best_filt_val = cm->filter_level;
+ int best_filt_val;
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
/* Replace unfiltered frame buffer with a new one */
@@ -274,8 +274,7 @@
int filter_step;
int filt_high = 0;
- /* Start search at previous frame filter level */
- int filt_mid = cm->filter_level;
+ int filt_mid;
int filt_low = 0;
int filt_best;
int filt_direction = 0;
diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
deleted file mode 100644
index 63f2357..0000000
--- a/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *coeff, short *dqcoeff);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// ppc
-extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp8_sad16x16_ppc;
-extern SADFunction vp8_sad16x8_ppc;
-extern SADFunction vp8_sad8x16_ppc;
-extern SADFunction vp8_sad8x8_ppc;
-extern SADFunction vp8_sad4x4_ppc;
-
-extern variance_function vp8_variance16x16_ppc;
-extern variance_function vp8_variance8x16_ppc;
-extern variance_function vp8_variance16x8_ppc;
-extern variance_function vp8_variance8x8_ppc;
-extern variance_function vp8_variance4x4_ppc;
-extern variance_function vp8_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-void vp8_cmachine_specific_config(void)
-{
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
- short_walsh4x4 = vp8_short_walsh4x4_c;
-
- vp8_variance4x4 = vp8_variance4x4_ppc;
- vp8_variance8x8 = vp8_variance8x8_ppc;
- vp8_variance8x16 = vp8_variance8x16_ppc;
- vp8_variance16x8 = vp8_variance16x8_ppc;
- vp8_variance16x16 = vp8_variance16x16_ppc;
- vp8_mse16x16 = vp8_mse16x16_ppc;
-
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
-
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- vp8_sad16x16 = vp8_sad16x16_ppc;
- vp8_sad16x8 = vp8_sad16x8_ppc;
- vp8_sad8x16 = vp8_sad8x16_ppc;
- vp8_sad8x8 = vp8_sad8x8_ppc;
- vp8_sad4x4 = vp8_sad4x4_ppc;
-
- vp8_block_error = vp8_block_error_ppc;
- vp8_mbblock_error = vp8_mbblock_error_c;
-
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mby = vp8_subtract_mby_ppc;
- vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
-}
diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm
deleted file mode 100644
index 6e0099d..0000000
--- a/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_subtract_mbuv_ppc
- .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- li r9, 256
- add r3, r3, r9
- add r3, r3, r9
- add r6, r6, r9
-
- li r10, 16
- li r9, 4
- mtctr r9
-
- vspltisw v0, 0
-
-mbu_loop:
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r4, r4, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
-
- add r4, r4, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbu_loop
-
- mtctr r9
-
-mbv_loop:
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r5, r5, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
-
- add r5, r5, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbv_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
- vspltisw v0, 0
-
-mby_loop:
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r5 ;# pred
-
- add r4, r4, r6
- addi r5, r5, 16
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vmrglb v3, v0, v1 ;# unpack low src to short
- vmrglb v4, v0, v2 ;# unpack low pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mby_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm
deleted file mode 100644
index 935d0cb..0000000
--- a/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_short_fdct4x4_ppc
- .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;# in normalization (fwd is twice unitary, inv is half unitary)
-;# and that they are of course transposes of each other.
-;#
-;# The following three accomplish most of implementation and
-;# are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- li r6, 16
-
- load_c v0, dct_tab, 0, r9, r10
- lvx v1, r6, r10
- addi r10, r10, 32
- lvx v2, 0, r10
- lvx v3, r6, r10
-
- load_c v4, ppc_dctperm_tab, 0, r9, r10
- load_c v5, ppc_dctperm_tab, r6, r9, r10
-
- load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
-;# a/A are the even rows 0,2 b/B are the odd rows 1,3
-;# For fwd transform, indices are horizontal positions, then frequencies.
-;# For inverse transform, frequencies then positions.
-;# The two resulting A0..A3 B0..B3 are later combined
-;# and vertically transformed.
-
-.macro two_rows_horiz Dst
- vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
-
- vmsumshm v10, v0, v8, v6
- vmsumshm v10, v1, v9, v10
- vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
-
- vmsumshm v11, v2, v8, v6
- vmsumshm v11, v3, v9, v11
- vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
-
- vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
- vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;# forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
- vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
- vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v10, v8, v7
-
- vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
- vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v8, v8, v7
-
- vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
-.endm
-
-.macro two_rows_h Dest
- stw r0, 0(r8)
- lwz r0, 4(r3)
- stw r0, 4(r8)
- lwzux r0, r3,r5
- stw r0, 8(r8)
- lwz r0, 4(r3)
- stw r0, 12(r8)
- lvx v8, 0,r8
- two_rows_horiz \Dest
-.endm
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
- addi r10, r3, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- ;# Next block
- addi r3, r10, 8
- addi r4, r4, 32
- lvx v6, 0, r9 ;# v6 = Hround
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .data
- .align 4
-ppc_dctperm_tab:
- .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
- .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
- .align 4
-dct_tab:
- .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
- .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
- .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
- .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
- .align 4
-round_tab:
- .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
- .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm
deleted file mode 100644
index ba48230..0000000
--- a/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_block_error_ppc
-
- .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- stw r5, 12(r1) ;# tranfer dc to vector register
-
- lvx v0, 0, r3 ;# Coeff
- lvx v1, 0, r4 ;# dqcoeff
-
- li r10, 16
-
- vspltisw v3, 0
-
- vsubshs v0, v0, v1
-
- vmsumshm v2, v0, v0, v3 ;# multiply differences
-
- lvx v0, r10, r3 ;# Coeff
- lvx v1, r10, r4 ;# dqcoeff
-
- vsubshs v0, v0, v1
-
- vmsumshm v1, v0, v0, v2 ;# multiply differences
- vsumsws v1, v1, v3 ;# sum up
-
- stvx v1, 0, r1
- lwz r3, 12(r1) ;# return value
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 29da926..582c8bc 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1661,7 +1661,6 @@
mv.as_mv.row = mvx[vcnt/2];
mv.as_mv.col = mvy[vcnt/2];
- find = 1;
/* sr is set to 0 to allow calling function to decide the search
* range.
*/
@@ -2293,7 +2292,6 @@
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
/* Further step/diamond searches as necessary */
- n = 0;
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
@@ -2560,8 +2558,6 @@
intra_rd_penalty, cpi, x);
if (this_rd < best_mode.rd || x->skip)
{
- /* Note index of best mode so far */
- best_mode_index = mode_index;
*returnrate = rd.rate2;
*returndistortion = rd.distortion2;
update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
index 291d219..f56e646 100644
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -35,7 +35,7 @@
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
char eob = 0;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *zbin_boost_ptr;
short *qcoeff_ptr = d->qcoeff;
DECLARE_ALIGNED_ARRAY(16, short, x, 16);
DECLARE_ALIGNED_ARRAY(16, short, y, 16);
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 6768ffd..bf8a853 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -861,9 +861,6 @@
if (!ctx->cfg.rc_target_bitrate)
return res;
- if (!ctx->cfg.rc_target_bitrate)
- return res;
-
if (img)
res = validate_img(ctx, img);
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 77a8709..7cdfaec 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -15,6 +15,18 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+
const vp9_prob vp9_cat1_prob[] = { 159 };
const vp9_prob vp9_cat2_prob[] = { 165, 145 };
const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 4eb2e64..eee096f 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -173,6 +173,7 @@
#define PIVOT_NODE 2 // which node is pivot
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index bb21ade..0fbc2d2 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -91,10 +91,7 @@
int flimit) {
uint8_t const *p_src;
uint8_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint8_t d[8];
(void)dst_pixels_per_line;
@@ -105,8 +102,8 @@
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -128,7 +125,7 @@
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -168,10 +165,7 @@
int flimit) {
uint16_t const *p_src;
uint16_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint16_t d[8];
@@ -181,8 +175,8 @@
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -205,7 +199,7 @@
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -518,22 +512,24 @@
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride
- + 2);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
- vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
+ srcs[i] + 2 * src_stride + 2);
+ uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
+ dsts[i] + 2 * dst_stride + 2);
+ vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width,
+ ppl);
} else {
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width, ppl);
}
#else
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
src_height, src_width, ppl);
#endif
}
@@ -558,16 +554,15 @@
* a gaussian distribution with sigma determined by q.
*/
{
- double i;
int next, j;
next = 0;
for (i = -32; i < 32; i++) {
- int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
- if (a) {
- for (j = 0; j < a; j++) {
+ if (a_i) {
+ for (j = 0; j < a_i; j++) {
char_dist[next + j] = (char) i;
}
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 1be358e..c496299 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -285,10 +285,10 @@
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -304,11 +304,10 @@
const struct scale_factors *sf) {
if (src != NULL) {
int i;
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1344813..c8628f8 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -499,7 +499,7 @@
specialize qw/vp9_highbd_d153_predictor_4x4/;
add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc";
+ specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc";
@@ -577,7 +577,7 @@
specialize qw/vp9_highbd_d153_predictor_16x16/;
add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc";
+ specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64";
@@ -1028,16 +1028,20 @@
specialize qw/vp9_sad32x32x8/;
add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x16x8 sse4/;
+specialize qw/vp9_sad16x16x8 sse4_1/;
+$vp9_sad16x16x8_sse4_1=vp9_sad16x16x8_sse4;
add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x8x8 sse4/;
+specialize qw/vp9_sad16x8x8 sse4_1/;
+$vp9_sad16x8x8_sse4_1=vp9_sad16x8x8_sse4;
add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x16x8 sse4/;
+specialize qw/vp9_sad8x16x8 sse4_1/;
+$vp9_sad8x16x8_sse4_1=vp9_sad8x16x8_sse4;
add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x8x8 sse4/;
+specialize qw/vp9_sad8x8x8 sse4_1/;
+$vp9_sad8x8x8_sse4_1=vp9_sad8x8x8_sse4;
add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad8x4x8/;
@@ -1046,7 +1050,8 @@
specialize qw/vp9_sad4x8x8/;
add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x4x8 sse4/;
+specialize qw/vp9_sad4x4x8 sse4_1/;
+$vp9_sad4x4x8_sse4_1=vp9_sad4x4x8_sse4;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
@@ -1109,6 +1114,15 @@
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
@@ -1162,6 +1176,9 @@
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
+ add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp sse2/;
+
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 8840750..1a3b946 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1807,162 +1807,134 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int plane, int block, int bw, int bh, int x,
- int y, int w, int h, int mi_x, int mi_y) {
+ int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ struct buf_2d *pre_buf, struct buf_2d *dst_buf,
+ const MV* mv, RefCntBuffer *ref_frame_buf,
+ int is_scaled, int ref) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
- const int is_compound = has_second_ref(&mi->mbmi);
- const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
- int ref;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+ buf_stride, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- const MV mv = mi->mbmi.sb_type < BLOCK_8X8
- ? average_split_mvs(pd, mi, ref, block)
- : mi->mbmi.mv[ref].as_mv;
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_frame_buf->buf.y_crop_width;
+ frame_height = ref_frame_buf->buf.y_crop_height;
+ ref_frame = ref_frame_buf->buf.y_buffer;
+ } else {
+ frame_width = ref_frame_buf->buf.uv_crop_width;
+ frame_height = ref_frame_buf->buf.uv_crop_height;
+ ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+ : ref_frame_buf->buf.v_buffer;
+ }
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ if (is_scaled) {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
+ // Co-ordinate of containing block to pixel precision.
+ int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
- MV32 scaled_mv;
- int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
- buf_stride, subpel_x, subpel_y;
- uint8_t *ref_frame, *buf_ptr;
- const int idx = xd->block_refs[ref]->idx;
- BufferPool *const pool = pbi->common.buffer_pool;
- RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
- const int is_scaled = vp9_is_scaled(sf);
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = (x_start + x) << SUBPEL_BITS;
+ y0_16 = (y_start + y) << SUBPEL_BITS;
- // Get reference frame pointer, width and height.
- if (plane == 0) {
- frame_width = ref_frame_buf->buf.y_crop_width;
- frame_height = ref_frame_buf->buf.y_crop_height;
- ref_frame = ref_frame_buf->buf.y_buffer;
- } else {
- frame_width = ref_frame_buf->buf.uv_crop_width;
- frame_height = ref_frame_buf->buf.uv_crop_height;
- ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
- : ref_frame_buf->buf.v_buffer;
+ // Co-ordinate of current block in reference frame
+ // to 1/16th pixel precision.
+ x0_16 = sf->scale_value_x(x0_16, sf);
+ y0_16 = sf->scale_value_y(y0_16, sf);
+
+ // Map the top left corner of the block into the reference frame.
+ x0 = sf->scale_value_x(x_start + x, sf);
+ y0 = sf->scale_value_y(y_start + y, sf);
+
+ // Scale the MV and incorporate the sub-pixel offset of the block
+ // in the reference frame.
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ // Co-ordinate of containing block to pixel precision.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+ scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+ buf_stride = pre_buf->stride;
+
+ // Do border extension if the reference is scaled, there is motion, or
+ // the width/height is not a multiple of 8 pixels.
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+ int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+ // Get reference block bottom right horizontal coordinate.
+ int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+ int x_pad = 0, y_pad = 0;
+
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ x_pad = 1;
}
- if (is_scaled) {
- // Co-ordinate of containing block to pixel precision.
- int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
- int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = (x_start + x) << SUBPEL_BITS;
- y0_16 = (y_start + y) << SUBPEL_BITS;
-
- // Co-ordinate of current block in reference frame
- // to 1/16th pixel precision.
- x0_16 = sf->scale_value_x(x0_16, sf);
- y0_16 = sf->scale_value_y(y0_16, sf);
-
- // Map the top left corner of the block into the reference frame.
- x0 = sf->scale_value_x(x_start + x, sf);
- y0 = sf->scale_value_y(y_start + y, sf);
-
- // Scale the MV and incorporate the sub-pixel offset of the block
- // in the reference frame.
- scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
- xs = sf->x_step_q4;
- ys = sf->y_step_q4;
- } else {
- // Co-ordinate of containing block to pixel precision.
- x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
- y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = x0 << SUBPEL_BITS;
- y0_16 = y0 << SUBPEL_BITS;
-
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ y_pad = 1;
}
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
- // Calculate the top left corner of the best matching block in the
- // reference frame.
- x0 += scaled_mv.col >> SUBPEL_BITS;
- y0 += scaled_mv.row >> SUBPEL_BITS;
- x0_16 += scaled_mv.col;
- y0_16 += scaled_mv.row;
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
- // Get reference block pointer.
- buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
- buf_stride = pre_buf->stride;
-
- // Get reference block bottom right vertical coordinate.
- y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-
- // Do border extension if there is motion or the
- // width/height is not a multiple of 8 pixels.
- if (is_scaled || scaled_mv.col || scaled_mv.row ||
- (frame_width & 0x7) || (frame_height & 0x7)) {
- // Get reference block bottom right horizontal coordinate.
- int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
- int x_pad = 0, y_pad = 0;
-
- if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
- x0 -= VP9_INTERP_EXTEND - 1;
- x1 += VP9_INTERP_EXTEND;
- x_pad = 1;
- }
-
- if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
- y0 -= VP9_INTERP_EXTEND - 1;
- y1 += VP9_INTERP_EXTEND;
- y_pad = 1;
- }
-
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1)));
-
- // Skip border extension if block is inside the frame.
- if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
- y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
- uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
- // Extend the border.
+ // Skip border extension if block is inside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+ // Extend the border.
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf_high,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
- y_pad * 3 * buf_stride + x_pad * 3;
- } else {
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf_high,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
+ y_pad * 3 * buf_stride + x_pad * 3;
+ } else {
build_mc_border(buf_ptr1,
pre_buf->stride,
xd->mc_buf,
@@ -1975,28 +1947,43 @@
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
- } else {
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1)));
- }
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
#else
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ } else {
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode) {
+ const int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
+#else
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
@@ -2005,24 +1992,50 @@
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
+ const MODE_INFO *mi = xd->mi[0].src_mi;
+ const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+ const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+ const int is_compound = has_second_ref(&mi->mbmi);
+
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
+ int ref;
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < num_4x4_h; ++y)
- for (x = 0; x < num_4x4_w; ++x)
- dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
- 4 * x, 4 * y, 4, 4, mi_x, mi_y);
- } else {
- dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ const int idx = xd->block_refs[ref]->idx;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+ const int is_scaled = vp9_is_scaled(sf);
+
+ if (sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, i++);
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv,
+ ref_frame_buf, is_scaled, ref);
+ }
+ }
+ } else {
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
+ }
}
}
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 35690b8..fd40875 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -45,17 +45,6 @@
return val;
}
-static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, /* 0 = LOW_VAL */
- -TWO_TOKEN, 4, /* 1 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
- 8, 10, /* 3 = HIGH_LOW */
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, /* 4 = CAT_ONE */
- 12, 14, /* 5 = CAT_THREEFOUR */
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, /* 6 = CAT_THREE */
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN /* 7 = CAT_FIVE */
-};
-
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
FRAME_COUNTS *counts, PLANE_TYPE type,
tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
@@ -147,7 +136,7 @@
val = 1;
} else {
INCREMENT_COUNT(TWO_TOKEN);
- token = vp9_read_tree(r, coeff_subtree_high,
+ token = vp9_read_tree(r, vp9_coef_con_tree,
vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
switch (token) {
case TWO_TOKEN:
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index be2e6cd..96a63bd 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -155,6 +155,10 @@
dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
vp9_frameworker_unlock_stats(src_worker);
+ dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
dst_cm->prev_frame = src_cm->show_existing_frame ?
src_cm->prev_frame : src_cm->cur_frame;
dst_cm->last_width = !src_cm->show_existing_frame ?
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index dc5cfe2..58daa3a 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,94 @@
return (sum + 8) >> 4;
}
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {  // 8 samples, src_stride apart -> coeff[0..7]
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];  // stage 1
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+ // Stage 2: add/subtract the stage-1 results; each stage is held as int16_t.
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+ // Stage 3: final sums/differences, stored to coeff in a permuted order.
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+// 8x8 2D transform: hadamard_col8 over each source column, then over buffer.
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];  // first-pass output, 8 values per source column
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;  // advance to the next source column
+ }
+ // Second pass: run hadamard_col8 down each column of buffer (stride 8).
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff);
+ coeff += 8;  // each pass writes 8 consecutive outputs
+ ++tmp_buf;
+ }
+}
+
+// 16x16 2D Hadamard: 8x8 transform per quadrant, then merged in place in coeff.
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {  // one 8x8 quadrant per iteration
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;  // idx bit 1: 8 rows down; bit 0: 8 columns right
+ vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+ // Merge entry idx of the four 64-entry quadrant outputs, halving each result.
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = a0 + a1;
+ int16_t b1 = a0 - a1;
+ int16_t b2 = a2 + a3;
+ int16_t b3 = a2 - a3;
+
+ coeff[0] = (b0 + b2) >> 1;
+ coeff[64] = (b1 + b3) >> 1;
+ coeff[128] = (b0 - b2) >> 1;
+ coeff[192] = (b1 - b3) >> 1;
+
+ ++coeff;  // same four-way merge for the next position in each quadrant
+ }
+}
+// Returns the sum of abs(coeff[i]) for i in [0, length), as int16_t.
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;  // accumulated as int
+ for (i = 0; i < length; ++i)
+ satd += abs(coeff[i]);
+ // NOTE(review): the int total is narrowed to int16_t below - confirm it fits.
+ return (int16_t)satd;
+}
+
// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
const int ref_stride, const int height) {
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b24fe29..d67d1f4 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -268,8 +268,7 @@
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
- !(is_inter &&
- (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
+ !(is_inter && skip)) {
write_selected_tx_size(cm, xd, w);
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 10a62ef..367ab3c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -165,21 +165,6 @@
return BLOCK_8X8;
}
-static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
- MACROBLOCK *x,
- int mi_row,
- int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
- mi_row, mi_col,
- BLOCK_64X64);
- if (var < 4)
- return BLOCK_64X64;
- else if (var < 10)
- return BLOCK_32X32;
- else
- return BLOCK_16X16;
-}
-
// Lighter version of set_offsets that only sets the mode info
// pointers.
static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
@@ -474,7 +459,7 @@
}
-void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
SPEED_FEATURES *const sf = &cpi->sf;
if (sf->partition_search_type != VAR_BASED_PARTITION &&
sf->partition_search_type != REFERENCE_PARTITION) {
@@ -482,36 +467,116 @@
} else {
VP9_COMMON *const cm = &cpi->common;
const int is_key_frame = (cm->frame_type == KEY_FRAME);
- const int threshold_multiplier = is_key_frame ? 80 : 4;
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
const int64_t threshold_base = (int64_t)(threshold_multiplier *
- vp9_convert_qindex_to_q(q, cm->bit_depth));
+ cpi->y_dequant[q][1]);
// TODO(marpan): Allow 4x4 partitions for inter-frames.
// use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
// If 4x4 partition is not used, then 8x8 partition will be selected
// if variance of 16x16 block is very high, so use larger threshold
// for 16x16 (threshold_bsize_min) in that case.
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - threshold_8x8;
if (is_key_frame) {
- cpi->vbp_threshold = threshold_base >> 2;
- cpi->vbp_threshold_bsize_max = threshold_base;
- cpi->vbp_threshold_bsize_min = threshold_base << 2;
- cpi->vbp_threshold_16x16 = cpi->vbp_threshold;
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base >> 2;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 2;
cpi->vbp_bsize_min = BLOCK_8X8;
} else {
- cpi->vbp_threshold = threshold_base;
+ thresholds[1] = threshold_base;
if (cm->width <= 352 && cm->height <= 288) {
- cpi->vbp_threshold_bsize_max = threshold_base >> 2;
- cpi->vbp_threshold_bsize_min = threshold_base << 3;
+ thresholds[0] = threshold_base >> 2;
+ thresholds[2] = threshold_base << 3;
} else {
- cpi->vbp_threshold_bsize_max = threshold_base;
- cpi->vbp_threshold_bsize_min = threshold_base << cpi->oxcf.speed;
+ thresholds[0] = threshold_base;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
}
- cpi->vbp_threshold_16x16 = cpi->vbp_threshold_bsize_min;
cpi->vbp_bsize_min = BLOCK_16X16;
}
}
}
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {  // fills vst->split[0..3] from 4x4 block averages
+ int k;
+ for (k = 0; k < 4; k++) {  // the four 4x4 sub-blocks at (x8_idx, y8_idx)
+ int x4_idx = x8_idx + ((k & 1) << 2);  // k bit 0: +4 in x
+ int y4_idx = y8_idx + ((k >> 1) << 2);  // k bit 1: +4 in y
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {  // else both stay 0
+ int s_avg;
+ int d_avg = 128;  // kept as-is when is_key_frame is set
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+ sum = s_avg - d_avg;  // difference of the two block averages
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);  // NOTE(review): 0 is presumably a log2 sample count - confirm
+ }
+}
+// Fills vst->split[0..3] from the averages of the four 8x8 blocks in a 16x16.
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {  // the four 8x8 sub-blocks at (x16_idx, y16_idx)
+ int x8_idx = x16_idx + ((k & 1) << 3);  // k bit 0: +8 in x
+ int y8_idx = y16_idx + ((k >> 1) << 3);  // k bit 1: +8 in y
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {  // else both stay 0
+ int s_avg;
+ int d_avg = 128;  // kept as-is when is_key_frame is set
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ } else {
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+ sum = s_avg - d_avg;  // difference of the two block averages
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);  // NOTE(review): 0 is presumably a log2 sample count - confirm
+ }
+}
+
// This function chooses partitioning based on the variance between source and
// reconstructed last, where variance is computed for down-sampled inputs.
static void choose_partitioning(VP9_COMP *cpi,
@@ -529,6 +594,8 @@
int sp;
int dp;
int pixels_wide = 64, pixels_high = 64;
+ int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
// Always use 4x4 partition for key frame.
const int is_key_frame = (cm->frame_type == KEY_FRAME);
@@ -541,6 +608,11 @@
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
cm->last_frame_seg_map;
segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ vp9_set_vbp_thresholds(cpi, thresholds, q);
+ }
}
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
@@ -560,18 +632,10 @@
const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
- BLOCK_SIZE bsize;
- if (mi_row + 4 < cm->mi_rows && mi_col + 4 < cm->mi_cols)
- bsize = BLOCK_64X64;
- else if (mi_row + 4 < cm->mi_rows && mi_col + 4 >= cm->mi_cols)
- bsize = BLOCK_32X64;
- else if (mi_row + 4 >= cm->mi_rows && mi_col + 4 < cm->mi_cols)
- bsize = BLOCK_64X32;
- else
- bsize = BLOCK_32X32;
+ const BLOCK_SIZE bsize = BLOCK_32X32
+ + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
assert(yv12 != NULL);
-
if (yv12_g && yv12_g != yv12) {
vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -656,33 +720,13 @@
v16x16 *vst = &vt.split[i].split[j];
variance4x4downsample[i2 + j] = 0;
if (!is_key_frame) {
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
- if (x8_idx < pixels_wide && y8_idx < pixels_high) {
- int s_avg, d_avg;
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- } else {
- s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- }
-#else
- s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- // If variance is based on 8x8 downsampling, we stop here and have
- // one sample for 8x8 block (so use 1 for count in fill_variance),
- // which of course means variance = 0 for 8x8 block.
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
- }
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
// For low-resolution, compute the variance based on 8x8 down-sampling,
// and if it is large (above the threshold) we go down for 4x4.
@@ -692,7 +736,7 @@
}
if (is_key_frame || (low_res &&
vt.split[i].split[j].part_variances.none.variance >
- (cpi->vbp_threshold << 1))) {
+ (thresholds[1] << 1))) {
// Go down to 4x4 down-sampling for variance.
variance4x4downsample[i2 + j] = 1;
for (k = 0; k < 4; k++) {
@@ -700,47 +744,18 @@
int y8_idx = y16_idx + ((k >> 1) << 3);
v8x8 *vst2 = is_key_frame ? &vst->split[k] :
&vt2[i2 + j].split[k];
- for (m = 0; m < 4; m++) {
- int x4_idx = x8_idx + ((m & 1) << 2);
- int y4_idx = y8_idx + ((m >> 1) << 2);
- unsigned int sse = 0;
- int sum = 0;
- if (x4_idx < pixels_wide && y4_idx < pixels_high) {
- int d_avg = 128;
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
#if CONFIG_VP9_HIGHBITDEPTH
- int s_avg;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (cm->frame_type != KEY_FRAME)
- d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- } else {
- s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (cm->frame_type != KEY_FRAME)
- d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- }
-#else
- int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- // If variance is based on 4x4 down-sampling, we stop here and have
- // one sample for 4x4 block (so use 1 for count in fill_variance),
- // which of course means variance = 0 for 4x4 block.
- fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
- }
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
}
}
}
}
- // No 64x64 blocks on segments other than base (un-boosted) segment,
- // so force split.
- if (cyclic_refresh_segment_id_boosted(segment_id))
- force_split[0] = 1;
-
// Fill the rest of the variance tree by summing split partition values.
for (i = 0; i < 4; i++) {
const int i2 = i << 2;
@@ -757,7 +772,7 @@
// If variance of this 32x32 block is above the threshold, force the block
// to split. This also forces a split on the upper (64x64) level.
get_variance(&vt.split[i].part_variances.none);
- if (vt.split[i].part_variances.none.variance > cpi->vbp_threshold) {
+ if (vt.split[i].part_variances.none.variance > thresholds[1]) {
force_split[i + 1] = 1;
force_split[0] = 1;
}
@@ -769,16 +784,15 @@
// we get to one that's got a variance lower than our threshold.
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
!set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
- cpi->vbp_threshold_bsize_max, BLOCK_16X16,
- force_split[0])) {
+ thresholds[0], BLOCK_16X16, force_split[0])) {
for (i = 0; i < 4; ++i) {
const int x32_idx = ((i & 1) << 2);
const int y32_idx = ((i >> 1) << 2);
const int i2 = i << 2;
if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
(mi_row + y32_idx), (mi_col + x32_idx),
- cpi->vbp_threshold,
- BLOCK_16X16, force_split[i + 1])) {
+ thresholds[1], BLOCK_16X16,
+ force_split[i + 1])) {
for (j = 0; j < 4; ++j) {
const int x16_idx = ((j & 1) << 1);
const int y16_idx = ((j >> 1) << 1);
@@ -791,8 +805,7 @@
if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
mi_row + y32_idx + y16_idx,
mi_col + x32_idx + x16_idx,
- cpi->vbp_threshold_16x16,
- cpi->vbp_bsize_min, 0)) {
+ thresholds[2], cpi->vbp_bsize_min, 0)) {
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
@@ -801,8 +814,7 @@
BLOCK_8X8,
mi_row + y32_idx + y16_idx + y8_idx,
mi_col + x32_idx + x16_idx + x8_idx,
- cpi->vbp_threshold_bsize_min,
- BLOCK_8X8, 0)) {
+ thresholds[3], BLOCK_8X8, 0)) {
set_block_size(cpi, xd,
(mi_row + y32_idx + y16_idx + y8_idx),
(mi_col + x32_idx + x16_idx + x8_idx),
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 8d545b6..1027130 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -38,7 +40,7 @@
void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
int tile_row, int tile_col);
-void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int64_t thresholds[], int q);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index fc05811..82f99b3 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -183,6 +183,33 @@
}
}
+int vp9_get_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {  // returns 0 after filling the map, -1 on rejected arguments
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {  // dimensions must match mb_rows x mb_cols; map non-NULL
+ unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ vpx_memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);  // all 1 when the active map is disabled, otherwise start from all 0
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Any segment id other than AM_SEGMENT_ID_INACTIVE marks the cell
+ // active, so cyclic refresh segments count as active too.
+ new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;  // two mi rows/cols share one output cell
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
MACROBLOCK *const mb = &cpi->td.mb;
cpi->common.allow_high_precision_mv = allow_high_precision_mv;
@@ -2931,7 +2958,7 @@
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
vp9_set_quantizer(cm, q);
- vp9_set_vbp_thresholds(cpi, q);
+ vp9_set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
setup_frame(cpi);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 914080c..a5342ad 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -460,10 +460,9 @@
int resize_pending;
// VAR_BASED_PARTITION thresholds
- int64_t vbp_threshold;
- int64_t vbp_threshold_bsize_min;
- int64_t vbp_threshold_bsize_max;
- int64_t vbp_threshold_16x16;
+ // 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - threshold_8x8;
+ int64_t vbp_thresholds[4];
BLOCK_SIZE vbp_bsize_min;
// Multi-threading
@@ -508,6 +507,8 @@
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 380f0b7..bf9e500 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2612,6 +2612,7 @@
}
#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
void vp9_twopass_postencode_update(VP9_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2651,7 +2652,7 @@
const int maxq_adj_limit =
rc->worst_quality - twopass->active_worst_quality;
const int minq_adj_limit =
- (cpi->oxcf.rc_mode == VPX_CQ) ? 0 : MINQ_ADJ_LIMIT;
+ (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
// Undershoot.
if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9602eb5..1f5f08a 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2017,7 +2017,7 @@
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -2082,7 +2082,7 @@
if (fn_ptr->sdx8f != NULL) {
while ((c + 7) < col_max) {
int i;
- unsigned int sads[8];
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -2106,7 +2106,7 @@
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ad895e7..0ad3249 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -188,6 +190,8 @@
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (scaled_ref_frame) {
@@ -198,6 +202,250 @@
return rv;
}
+static void block_variance(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           int w, int h, unsigned int *sse, int *sum,
+                           int block_size, unsigned int *sse8x8,
+                           int *sum8x8, unsigned int *var8x8) {
+  // Per-block sse/sum/variance in raster order; totals go to *sse and *sum.
+  int row, col, idx = 0;
+  *sse = 0;
+  *sum = 0;
+  for (row = 0; row < h; row += block_size) {
+    const uint8_t *const src_row = src + src_stride * row;
+    const uint8_t *const ref_row = ref + ref_stride * row;
+    for (col = 0; col < w; col += block_size, ++idx) {
+      vp9_get8x8var(src_row + col, src_stride, ref_row + col, ref_stride,
+                    &sse8x8[idx], &sum8x8[idx]);
+      *sse += sse8x8[idx];
+      *sum += sum8x8[idx];
+      // The squared sum is divided by 64 (>> 6) before it is subtracted.
+      var8x8[idx] = sse8x8[idx] - (((unsigned int)sum8x8[idx] * sum8x8[idx]) >> 6);
+    }
+  }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+                               unsigned int *sse_i, int *sum_i,
+                               unsigned int *var_o, unsigned int *sse_o,
+                               int *sum_o) {
+  // Merges each 2x2 group of tx_size units into one larger unit. The sum is
+  // squared in 64 bits; a 32-bit square wraps once |sum| exceeds 65535.
+  const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+  const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+  const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+  const int shift = b_width_log2_lookup[unit_size] +
+                    b_height_log2_lookup[unit_size] + 6;
+  int i, j, k = 0;
+  for (i = 0; i < nh; i += 2) {
+    for (j = 0; j < nw; j += 2, ++k) {
+      const int top = i * nw + j, bot = top + nw;
+      sse_o[k] = sse_i[top] + sse_i[top + 1] + sse_i[bot] + sse_i[bot + 1];
+      sum_o[k] = sum_i[top] + sum_i[top + 1] + sum_i[bot] + sum_i[bot + 1];
+      var_o[k] = sse_o[k] -
+                 (unsigned int)(((int64_t)sum_o[k] * sum_o[k]) >> shift);
+    }
+  }
+}
+
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y,
+ int mi_row, int mi_col, int *early_term) {
+ // Models luma rate/distortion for a block and tests whether its transform
+ // can be skipped. Transform coeffs are 8 times an orthogonal transform, so
+ // the quantizer step is divided by 8 before it goes to the modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const int64_t dc_thr = dc_quant * dc_quant >> 6;
+ const int64_t ac_thr = ac_quant * ac_quant >> 6;
+ unsigned int var;
+ int sum;
+ int skip_dc = 0;  // set when the DC test passes but the AC test fails
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ const int num8x8 = 1 << (bw + bh - 2);  // 8x8 blocks in a (4 << bw) x (4 << bh) area
+ unsigned int sse8x8[64] = {0};
+ int sum8x8[64] = {0};
+ unsigned int var8x8[64] = {0};
+ TX_SIZE tx_size;
+ int i, k;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+ var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));  // bw + bh + 4 = log2(pixel count)
+
+ *var_y = var;
+ *sse_y = sse;
+ // Choose the transform size; it is stored in the block's mode info below.
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
+ if (sse > (var << 2))
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id))
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ }
+ } else {
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ }
+
+ assert(tx_size >= TX_8X8);
+ xd->mi[0].src_mi->mbmi.tx_size = tx_size;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ {
+ unsigned int sse16x16[16] = {0};
+ int sum16x16[16] = {0};
+ unsigned int var16x16[16] = {0};
+ const int num16x16 = num8x8 >> 2;
+
+ unsigned int sse32x32[4] = {0};
+ int sum32x32[4] = {0};
+ unsigned int var32x32[4] = {0};
+ const int num32x32 = num8x8 >> 4;
+
+ int ac_test = 1;
+ int dc_test = 1;
+ const int num = (tx_size == TX_8X8) ? num8x8 :
+ ((tx_size == TX_16X16) ? num16x16 : num32x32);
+ const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+ ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+ const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+ ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+ // Calculate variance if tx_size > TX_8X8
+ if (tx_size >= TX_16X16)
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ if (tx_size == TX_32X32)
+ calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+ sse32x32, sum32x32);
+
+ // Skipping test: skip_txfm[0] ends as 0 (no skip), 2 (only the AC test
+ x->skip_txfm[0] = 0;  // passes) or 1 (both the AC and DC tests pass).
+ for (k = 0; k < num; k++)
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+
+ for (k = 0; k < num; k++)
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+
+ if (ac_test) {
+ x->skip_txfm[0] = 2;
+
+ if (dc_test)
+ x->skip_txfm[0] = 1;
+ } else if (dc_test) {
+ skip_dc = 1;
+ }
+ }
+ // Luma fully skippable: zero rate, distortion from sse, then test chroma.
+ if (x->skip_txfm[0] == 1) {
+ int skip_uv[2] = {0};
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+
+ // Transform skipping test in UV planes.
+ for (i = 1; i <= 2; i++) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0].src_mi->mbmi, pd);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ int j = i - 1;  // index into the two-entry chroma arrays
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;  // skip_uv[j] stays 0, so *early_term is not set below
+ }
+
+ // If the transforms in all YUV planes are skippable, the mode search checks
+ // fewer inter modes and doesn't check intra modes.
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+
+ return;
+ }
+ // Model the DC part unless only the DC test passed (skip_dc).
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+ // The AC part is always modeled, from the block variance.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
@@ -312,6 +560,132 @@
*out_dist_sum += dist << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH  // this variant only calls the variance-based model
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var_y, sse_y;
+ (void)plane;  // plane and tx_size are unused in this variant
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;  // this variant never reports the block as skippable
+ return;
+}
+#else  // !CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);  // block index advance per transform block
+ const int block_step = (1 << tx_size);  // row/column advance per transform block
+ int block = 0, r, c;
+ int shift = tx_size == TX_32X32 ? 0 : 2;
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int eob_cost = 0;
+ // Estimates rate/distortion of one plane by transforming and quantizing it.
+ (void)cpi;
+ vp9_subtract_plane(x, bsize, plane);
+ *skippable = 1;
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+ // 16x16 and 8x8 use the Hadamard routines; 32x32 and 4x4 use other transforms.
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ *skippable &= (*eob == 0);
+ eob_cost += 1;  // one unit per transform block processed
+ }
+ block += step;
+ }
+ }
+ // All blocks quantized to zero: report zero rate and the scaled input *sse.
+ if (*skippable && *sse < INT64_MAX) {  // NOTE(review): INT64_MAX looks like a "no sse supplied" sentinel; confirm with callers
+ *rate = 0;
+ *dist = (*sse << 6) >> shift;
+ *sse = *dist;
+ return;
+ }
+ // Second pass: accumulate rate and distortion per transform block.
+ block = 0;
+ *rate = 0;
+ *dist = 0;
+ *sse = (*sse << 6) >> shift;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ // Rate proxy: |qcoeff[0]| when eob is 1, the SATD of qcoeff when larger.
+ if (*eob == 1)
+ *rate += (int)abs(qcoeff[0]);
+ else if (*eob > 1)
+ *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+ // Distortion: error between coeff and dqcoeff, scaled down by shift.
+ *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+ }
+ block += step;
+ }
+ }
+ // Scale the rate proxy and add a fixed amount per processed transform block.
+ if (*skippable == 0) {
+ *rate <<= 10;
+ *rate += (eob_cost << 8);
+ }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
@@ -518,7 +892,9 @@
int i, j;
int rate;
int64_t dist;
- unsigned int var_y, sse_y;
+ int64_t this_sse = INT64_MAX;
+ int is_skippable;
+
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
assert(plane == 0);
(void) plane;
@@ -533,8 +909,13 @@
x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
i, j, 0);
- // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, MIN(tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
args->rate += rate;
@@ -602,9 +983,23 @@
*rd_cost = best_rdc;
}
-static const int ref_frame_cost[MAX_REF_FRAMES] = {
- 1235, 229, 530, 615,
-};
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+                                MACROBLOCKD *const xd,
+                                int ref_frame_cost[MAX_REF_FRAMES]) {
+  // Builds per-reference-frame bit costs from the intra/inter probability and
+  // the two single-reference probabilities queried for (cm, xd).
+  const vp9_prob inter_prob = vp9_get_intra_inter_prob(cm, xd);
+  const vp9_prob single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  const vp9_prob single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+  const int inter_cost = vp9_cost_bit(inter_prob, 1);
+  // GOLDEN and ALTREF share the first single-reference bit (1).
+  const int not_last_cost = inter_cost + vp9_cost_bit(single_p1, 1);
+
+  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(inter_prob, 0);
+  ref_frame_cost[LAST_FRAME] = inter_cost + vp9_cost_bit(single_p1, 0);
+  ref_frame_cost[GOLDEN_FRAME] = not_last_cost + vp9_cost_bit(single_p2, 0);
+  ref_frame_cost[ALTREF_FRAME] = not_last_cost + vp9_cost_bit(single_p2, 1);
+}
typedef struct {
MV_REFERENCE_FRAME ref_frame;
@@ -682,6 +1077,10 @@
int ref_frame_skip_mask = 0;
int idx;
int best_pred_sad = INT_MAX;
+ int best_early_term = 0;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ init_ref_frame_cost(cm, xd, ref_frame_cost);
if (reuse_inter_pred) {
int i;
@@ -773,6 +1172,10 @@
int mode_index;
int i;
PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ int64_t this_sse;
+ int is_skippable;
+ int this_early_term = 0;
+
if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
continue;
@@ -925,17 +1328,55 @@
var_y = pf_var[best_filter];
sse_y = pf_sse[best_filter];
x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred) {
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = this_mode_pred->stride;
+ }
} else {
mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
- &var_y, &sse_y);
- this_rdc.rate +=
- cm->interp_filter == SWITCHABLE ?
- vp9_get_switchable_rate(cpi, xd) : 0;
+
+ // For large partition blocks, extra testing is done.
+ if (bsize > BLOCK_32X32 &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id) &&
+ cm->base_qindex) {
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+ &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+ &this_early_term);
+ } else {
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
+ }
}
- // chroma component rate-distortion cost modeling
+ if (!this_early_term) {
+ this_sse = (int64_t)sse_y;
+ block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+ &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ x->skip_txfm[0] = 1;
+ }
+ }
+
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+ }
+ } else {
+ this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+ vp9_get_switchable_rate(cpi, xd) : 0;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ }
+
if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
int uv_rate = 0;
int64_t uv_dist = 0;
@@ -943,7 +1384,8 @@
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
if (x->color_sensitivity[1])
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
- model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
+ model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+ &var_y, &sse_y);
this_rdc.rate += uv_rate;
this_rdc.dist += uv_dist;
}
@@ -982,6 +1424,7 @@
best_tx_size = mbmi->tx_size;
best_ref_frame = ref_frame;
best_mode_skip_txfm = x->skip_txfm[0];
+ best_early_term = this_early_term;
if (reuse_inter_pred) {
free_pred_buffer(best_pred);
@@ -994,6 +1437,13 @@
if (x->skip)
break;
+
+ // If early termination flag is 1 and at least 2 modes are checked,
+ // the mode search is terminated.
+ if (best_early_term && idx > 0) {
+ x->skip = 1;
+ break;
+ }
}
mbmi->mode = best_mode;
@@ -1042,6 +1492,8 @@
const PREDICTION_MODE this_mode = intra_mode_list[i];
if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
continue;
+ mbmi->mode = this_mode;
+ mbmi->ref_frame[0] = INTRA_FRAME;
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
@@ -1058,17 +1510,17 @@
if (this_rdc.rdcost < best_rdc.rdcost) {
best_rdc = this_rdc;
- mbmi->mode = this_mode;
+ best_mode = this_mode;
best_intra_tx_size = mbmi->tx_size;
- mbmi->ref_frame[0] = INTRA_FRAME;
+ best_ref_frame = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
+ best_mode_skip_txfm = x->skip_txfm[0];
}
}
// Reset mb_mode_info to the best inter mode.
- if (mbmi->ref_frame[0] != INTRA_FRAME) {
- x->skip_txfm[0] = best_mode_skip_txfm;
+ if (best_ref_frame != INTRA_FRAME) {
mbmi->tx_size = best_tx_size;
} else {
mbmi->tx_size = best_intra_tx_size;
@@ -1076,6 +1528,9 @@
}
pd->dst = orig_dst;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ x->skip_txfm[0] = best_mode_skip_txfm;
if (reuse_inter_pred && best_pred != NULL) {
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5efa40b..166535b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -292,6 +292,18 @@
return error;
}
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  // Returns the sum of squared differences over block_size coefficients.
+  int64_t error = 0;
+  int i;
+  for (i = 0; i < block_size; i++) {
+    // Square in 64 bits: |diff| can reach 65535, whose square overflows int.
+    const int64_t diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+  return error;
+}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
@@ -1540,6 +1552,7 @@
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv) {
+ const VP9_COMMON *const cm = &cpi->common;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
MACROBLOCKD *xd = &x->e_mbd;
@@ -1548,14 +1561,8 @@
mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
int_mv ref_mv[2];
int ite, ref;
- // Prediction buffer from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- uint8_t *second_pred;
- uint8_t *second_pred_alloc;
-#else
- uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-#endif // CONFIG_VP9_HIGHBITDEPTH
const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+ struct scale_factors sf;
// Do joint motion search in compound mode to get more accurate mv.
struct buf_2d backup_yv12[2][MAX_MB_PLANE];
@@ -1564,14 +1571,13 @@
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
};
+
+ // Prediction buffer from second frame.
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
- } else {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
- second_pred = second_pred_alloc;
- }
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, second_pred_alloc_16, 64 * 64);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, second_pred, 64 * 64);
#endif // CONFIG_VP9_HIGHBITDEPTH
for (ref = 0; ref < 2; ++ref) {
@@ -1591,6 +1597,17 @@
frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
}
+ // Since we have scaled the reference frames to match the size of the current
+ // frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// Allow joint search multiple times iteratively for each reference frame
// and break out of the search loop if it couldn't find a better mv.
for (ite = 0; ite < 4; ite++) {
@@ -1615,22 +1632,22 @@
// Get the prediction block from the 'other' reference frame.
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE,
xd->bd);
} else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
vp9_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
}
@@ -1639,8 +1656,7 @@
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1712,12 +1728,6 @@
&mbmi->ref_mvs[refs[ref]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
-
-#if CONFIG_VP9_HIGHBITDEPTH
- vpx_free(second_pred_alloc);
-#else
- vpx_free(second_pred);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2412,7 +2422,6 @@
int_mv cur_mv[2];
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
uint8_t *tmp_buf;
#else
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
@@ -2441,7 +2450,7 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
} else {
- tmp_buf = tmp_buf8;
+ tmp_buf = (uint8_t *)tmp_buf16;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 3515b6e..4c5ba5d 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -301,7 +301,7 @@
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
- sf->use_nonrd_pick_mode = !is_keyframe;
+ sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4c89953..799109b 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -65,18 +65,6 @@
-CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
};
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, // 0 = LOW_VAL
- -TWO_TOKEN, 4, // 1 = TWO
- -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
- 8, 10, // 3 = HIGH_LOW
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
- 12, 14, // 5 = CAT_THREEFOUR
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
-};
-
static const vp9_tree_index cat1[2] = {0, 0};
static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4e80b255e..ecd6ce9 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -57,6 +57,179 @@
return (avg + 8) >> 4;
}
+static void hadamard_col8_sse2(__m128i *in, int iter) {  // in-place on in[0..7]; iter == 0 also transposes the result
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+ // Stage 1: 16-bit add/sub butterflies on adjacent vectors.
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+ // Stage 2: butterflies between vectors two apart.
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+ // Stage 3 pairs a[k] with a[k + 4]; results land in a permuted order.
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+ // Transpose the 8x8 block of 16-bit lanes: interleave 16-bit lanes,
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+ // then 32-bit pairs,
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+ // then 64-bit halves, so in[n] receives lane n of b0..b7.
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {  // iter != 0: same final stage, written straight back without a transpose
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+// 8x8 Hadamard transform of 16-bit residuals using SSE2. Only 16-bit adds and
+// subtracts are used; the output is not scaled.
+//
+//   src_diff:   first of eight rows; each row is read with one aligned
+//               128-bit load, so it must be 16-byte aligned.
+//   src_stride: distance between consecutive rows, in int16_t elements.
+//   coeff:      receives 64 values through eight aligned 128-bit stores, so
+//               it must be 16-byte aligned as well.
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+                           int16_t *coeff) {
+  __m128i rows[8];
+  int r;
+
+  // Gather the eight input rows.
+  for (r = 0; r < 8; ++r) {
+    const int16_t *const row_ptr = src_diff + r * src_stride;
+    rows[r] = _mm_load_si128((const __m128i *)row_ptr);
+  }
+
+  // Run the 8-point column transform twice. The first pass finishes with a
+  // transpose, so the second pass operates along the other dimension of the
+  // block.
+  hadamard_col8_sse2(rows, 0);
+  hadamard_col8_sse2(rows, 1);
+
+  // Emit the result as eight consecutive groups of eight coefficients.
+  for (r = 0; r < 8; ++r) {
+    __m128i *const out = (__m128i *)(coeff + 8 * r);
+    _mm_store_si128(out, rows[r]);
+  }
+}
+
+// 16x16 Hadamard transform of 16-bit residuals using SSE2.
+// The four 8x8 quadrants (top-left, top-right, bottom-left, bottom-right) are
+// transformed into consecutive 64-coefficient groups of coeff. A final
+// butterfly stage then combines matching positions of the four groups in
+// place, halving every result with an arithmetic shift.
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int quad_row, quad_col, pos;
+
+  for (quad_row = 0; quad_row < 2; ++quad_row) {
+    for (quad_col = 0; quad_col < 2; ++quad_col) {
+      const int16_t *const quad_src =
+          src_diff + quad_row * 8 * src_stride + quad_col * 8;
+      int16_t *const quad_dst = coeff + (2 * quad_row + quad_col) * 64;
+      vp9_hadamard_8x8_sse2(quad_src, src_stride, quad_dst);
+    }
+  }
+
+  for (pos = 0; pos < 64; pos += 8) {
+    __m128i *const q0 = (__m128i *)(coeff + pos);
+    __m128i *const q1 = (__m128i *)(coeff + pos + 64);
+    __m128i *const q2 = (__m128i *)(coeff + pos + 128);
+    __m128i *const q3 = (__m128i *)(coeff + pos + 192);
+    const __m128i in0 = _mm_load_si128(q0);
+    const __m128i in1 = _mm_load_si128(q1);
+    const __m128i in2 = _mm_load_si128(q2);
+    const __m128i in3 = _mm_load_si128(q3);
+    const __m128i sum01 = _mm_add_epi16(in0, in1);
+    const __m128i dif01 = _mm_sub_epi16(in0, in1);
+    const __m128i sum23 = _mm_add_epi16(in2, in3);
+    const __m128i dif23 = _mm_sub_epi16(in2, in3);
+    _mm_store_si128(q0, _mm_srai_epi16(_mm_add_epi16(sum01, sum23), 1));
+    _mm_store_si128(q1, _mm_srai_epi16(_mm_add_epi16(dif01, dif23), 1));
+    _mm_store_si128(q2, _mm_srai_epi16(_mm_sub_epi16(sum01, sum23), 1));
+    _mm_store_si128(q3, _mm_srai_epi16(_mm_sub_epi16(dif01, dif23), 1));
+  }
+}
+
+// Sum of absolute values of `length` 16-bit coefficients using SSE2.
+// coeff is read with aligned 128-bit loads, eight values at a time; at least
+// one group of eight is always read. All accumulation is done in 16-bit
+// lanes without saturation, and the 16-bit total is returned.
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+  __m128i acc = _mm_setzero_si128();
+  __m128i folded;
+  int pos = 0;
+
+  do {
+    const __m128i v = _mm_load_si128((const __m128i *)(coeff + pos));
+    const __m128i sign_mask = _mm_srai_epi16(v, 15);
+    // |v| = (v ^ sign) - sign
+    const __m128i abs_v =
+        _mm_sub_epi16(_mm_xor_si128(v, sign_mask), sign_mask);
+    acc = _mm_add_epi16(acc, abs_v);
+    pos += 8;
+  } while (pos < length);
+
+  // Horizontal add: fold 8 lanes to 4, then 2, then 1.
+  folded = _mm_add_epi16(acc, _mm_srli_si128(acc, 8));
+  folded = _mm_add_epi16(folded, _mm_srli_epi64(folded, 32));
+  folded = _mm_add_epi16(folded, _mm_srli_epi32(folded, 16));
+
+  return _mm_extract_epi16(folded, 0);
+}
+
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
const int ref_stride, const int height) {
int idx;
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index bdc75e9..1c1005a 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -293,7 +293,8 @@
if (!skip_block) {
__m128i eob;
- __m128i round, quant, dequant;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
{
__m128i coeff0, coeff1;
@@ -368,6 +369,7 @@
// AC only loop
index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
{
@@ -387,28 +389,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 28458dc..3a29aba 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -179,4 +179,77 @@
mova [outputq + 112], m7
RET
+
+%macro HMD8_1D 0 ; Three add/sub butterfly stages over m0..m7 (word lanes); m8 and m9 are scratch.
+ psubw m8, m0, m1 ; Stage 1: pairs (0,1), (2,3), (4,5), (6,7). m8 = m0 - m1.
+ psubw m9, m2, m3
+ paddw m0, m1 ; Sum stays in the lower-numbered register of each pair.
+ paddw m2, m3
+ SWAP 1, 8 ; SWAP renames registers, so the difference is now addressed as m1.
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2 ; Stage 2: pairs (0,2), (1,3), (4,6), (5,7).
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4 ; Stage 3: pairs (0,4), (1,5), (2,6), (3,7).
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output ; 3 args, 5 GPRs, 10 XMM regs: 8 rows of 8 words in, 128 bytes out.
+ lea r3, [2 * strideq] ; r3 = byte offset of one row, assuming stride counts 16-bit elements. NOTE(review): strideq is used full-width -- confirm the C prototype passes a pointer-sized stride.
+ lea r4, [4 * strideq] ; r4 = byte offset of two rows.
+
+ mova m0, [inputq] ; Load rows 0..7, two per step (mova is the aligned move).
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4] ; Advance two rows.
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D ; Butterflies across the eight row registers.
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 ; Transpose m0..m7; the trailing 9 is presumably the scratch register.
+ HMD8_1D ; Same butterflies again, now along the other dimension.
+
+ mova [outputq + 0], m0 ; Store the eight result vectors back to back, 16 bytes each.
+ mova [outputq + 16], m1
+ mova [outputq + 32], m2
+ mova [outputq + 48], m3
+ mova [outputq + 64], m4
+ mova [outputq + 80], m5
+ mova [outputq + 96], m6
+ mova [outputq + 112], m7
+
+ RET
%endif
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb..56373e8 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@
movd edx, m5
%endif
RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size ; 3 args, 3 GPRs, 6 XMM regs.
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2] ; Point both arrays at their ends (size is in 2-byte elements) ...
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq ; ... and index upward from -size toward zero.
+.loop:
+ mova m2, [uqcq+sizeq*2] ; Each iteration consumes two vectors (16 words) from each input.
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2 ; Per-word difference dqcoeff - coeff.
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0 ; Square each word and add adjacent pairs into dword lanes.
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5 ; Interleaving with the zero register zero-extends dwords to qwords.
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize ; mmsize elements = 2 * mmsize bytes, i.e. the two vectors just processed.
+ jl .loop ; Loop while the index is still negative; presumably size is a positive multiple of 16 -- TODO confirm.
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4 ; Bring the high qword down and add it to the low qword.
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4 ; 64-bit result returned in rax.
+%else
+ pshufd m5, m4, 0x1 ; Move dword 1 (upper half of the result) into lane 0.
+ movd eax, m4 ; 32-bit builds return the result as the edx:eax pair.
+ movd edx, m5
+%endif
+ RET
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 679c66e..00abd3c 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -230,6 +230,8 @@
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
+ __m128i thr;
+ int16_t nzflag;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
@@ -316,6 +318,8 @@
n_coeffs += 8 * 2;
}
+ thr = _mm_srai_epi16(dequant, 1);
+
// AC only loop
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
@@ -335,28 +339,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index c35eb36..449d52b 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -282,6 +282,8 @@
psignw m8, m9
psignw m13, m10
psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
%endif
mova [r4q+ncoeffq*2+ 0], m8
mova [r4q+ncoeffq*2+16], m13
@@ -302,7 +304,7 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, fp_32x32
+
pcmpgtw m7, m6, m0
pcmpgtw m12, m11, m0
pmovmskb r6d, m7
@@ -310,7 +312,7 @@
or r6, r2
jz .skip_iter
-%endif
+
pcmpeqw m7, m7
paddsw m6, m1 ; m6 += round
@@ -348,7 +350,6 @@
add ncoeffq, mmsize
jl .ac_only_loop
-%ifidn %1, fp_32x32
jmp .accumulate_eob
.skip_iter:
mova [r3q+ncoeffq*2+ 0], m5
@@ -357,7 +358,6 @@
mova [r4q+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
-%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8670405..037367b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -144,8 +144,10 @@
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
# neon with assembly and intrinsics implementations. If both are available
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0ce37aa..cba15e6 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1260,6 +1260,21 @@
}
}
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,  // Control handler registered for VP9E_GET_ACTIVEMAP.
+ va_list args) {
+ vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);  // The single control argument: caller-supplied map descriptor.
+
+ if (map) {
+ if (!vp9_get_active_map(ctx->cpi, map->active_map,  // A zero return from vp9_get_active_map is treated as success.
+ (int)map->rows, (int)map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;  // Non-zero return from the encoder is reported as an invalid parameter.
+ } else {
+ return VPX_CODEC_INVALID_PARAM;  // A NULL map pointer is rejected.
+ }
+}
+
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
@@ -1417,6 +1432,7 @@
#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
{VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id},
#endif
+ {VP9E_GET_ACTIVEMAP, ctrl_get_active_map},
{ -1, NULL},
};
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 7350cb3..c2f782b 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -755,6 +755,8 @@
(FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ set_ppflags(ctx, &flags);
// Wait for the frame from worker thread.
if (winterface->sync(worker)) {
// Check if worker has received any frames.
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 2b3f894..f8da734 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,12 +152,14 @@
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 60b588f..0e8adc1 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -508,6 +508,12 @@
* Supported in codecs: VP9
*/
VP9E_SET_COLOR_SPACE,
+
+ /*!\brief Codec control function to get an Active map back from the encoder.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_ACTIVEMAP,
};
/*!\brief vpx 1-D scaling mode
@@ -691,6 +697,8 @@
VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
diff --git a/vpxdec.c b/vpxdec.c
index 0403550..8c938df 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -1080,9 +1080,6 @@
}
}
}
-
- if (stop_after && frame_in >= stop_after)
- break;
}
if (summary || progress) {
diff --git a/webmdec.cc b/webmdec.cc
index d591f3e..e152f5e 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -63,6 +63,7 @@
struct VpxInputContext *vpx_ctx) {
mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
mkvparser::EBMLHeader header;
long long pos = 0;
@@ -121,6 +122,11 @@
uint8_t **buffer,
size_t *bytes_in_buffer,
size_t *buffer_size) {
+ // This check is needed for frame parallel decoding, in which case this
+ // function could be called even after it has reached end of input stream.
+ if (webm_ctx->reached_eos) {
+ return 1;
+ }
mkvparser::Segment *const segment =
reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
const mkvparser::Cluster* cluster =
@@ -140,6 +146,7 @@
cluster = segment->GetNext(cluster);
if (cluster == NULL || cluster->EOS()) {
*bytes_in_buffer = 0;
+ webm_ctx->reached_eos = 1;
return 1;
}
status = cluster->GetFirst(block_entry);
@@ -212,6 +219,7 @@
webm_ctx->block_entry = NULL;
webm_ctx->block_frame_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->reached_eos = 0;
return 0;
}
diff --git a/webmdec.h b/webmdec.h
index 1cd35d4..7d16380 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -29,6 +29,7 @@
int video_track_index;
uint64_t timestamp_ns;
int is_key_frame;
+ int reached_eos;
};
// Checks if the input is a WebM file. If so, initializes WebMInputContext so