Merge "Prevent decoder from using uninitialized entropy context."
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 7be583d..f980180 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1272,9 +1272,6 @@
check_cc <<EOF && INLINE="inline"
static inline function() {}
EOF
- check_cc <<EOF && INLINE="__inline__ __attribute__((always_inline))"
-static __attribute__((always_inline)) function() {}
-EOF
# Almost every platform uses pthreads.
if enabled multithread; then
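
The probe removed above offered "__inline__ __attribute__((always_inline))" as a fallback spelling of "inline". Whichever probe succeeds is recorded in vpx_config.h as the INLINE macro, which the C sources then use instead of the bare keyword (see the vp9_iht*_add_neon.c files below). A minimal sketch of that plumbing, with an illustrative helper:

    /* The winning keyword from the configure probes lands in vpx_config.h: */
    #ifndef INLINE
    #define INLINE inline  /* or __forceinline, or empty, per the probe */
    #endif

    /* C sources then spell inline through the macro: */
    static INLINE int square(int x) { return x * x; }  /* illustrative */
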
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 9030108..bdd6690 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -22,7 +22,8 @@
FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
MAKE_JOBS=1
-LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/../.."; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
TARGETS="arm64-darwin-gcc
@@ -41,7 +42,7 @@
mkdir "${target}"
cd "${target}"
- eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+ eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
--disable-docs ${EXTRA_CONFIGURE_ARGS} ${devnull}
export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull}
@@ -57,9 +58,6 @@
arm64-*)
echo "__aarch64__"
;;
- armv6-*)
- echo "__ARM_ARCH_6__"
- ;;
armv7-*)
echo "__ARM_ARCH_7A__"
;;
@@ -175,8 +173,13 @@
# Trap function. Cleans up the subtree used to build all targets contained in
# $TARGETS.
cleanup() {
+  local res=$?
cd "${ORIG_PWD}"
+ if [ $res -ne 0 ]; then
+ elog "build exited with error ($res)"
+ fi
+
if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
rm -rf "${BUILD_ROOT}"
fi
@@ -186,14 +189,21 @@
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
+ --extra-configure-args <args>: Extra args to pass when configuring libvpx.
--jobs: Number of make jobs.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
+ --targets <targets>: Override default target list. Defaults:
+ ${TARGETS}
--verbose: Output information about the environment and each stage of the
build.
EOF
}
+elog() {
+ echo "${0##*/} failed because: $@" 1>&2
+}
+
vlog() {
if [ "${VERBOSE}" = "yes" ]; then
echo "$@"
@@ -223,6 +233,10 @@
--show-build-output)
devnull=
;;
+ --targets)
+ TARGETS="$2"
+ shift
+ ;;
--verbose)
VERBOSE=yes
;;
@@ -251,3 +265,5 @@
fi
build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+echo " ${TARGETS}"
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 7f9398c..3a0e3db 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -35,6 +35,11 @@
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const = 0;
+ virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+ const vpx_codec_flags_t flags,
+ unsigned long deadline) // NOLINT(runtime/int)
+ const = 0;
+
virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,
unsigned long deadline,
const unsigned long init_flags,
@@ -72,6 +77,10 @@
VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
: Decoder(cfg, deadline) {}
+ VP8Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+ unsigned long deadline) // NOLINT
+ : Decoder(cfg, flag, deadline) {}
+
protected:
virtual vpx_codec_iface_t* CodecInterface() const {
#if CONFIG_VP8_DECODER
@@ -104,8 +113,14 @@
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const {
+ return CreateDecoder(cfg, 0, deadline);
+ }
+
+ virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+ const vpx_codec_flags_t flags,
+ unsigned long deadline) const { // NOLINT
#if CONFIG_VP8_DECODER
- return new VP8Decoder(cfg, deadline);
+ return new VP8Decoder(cfg, flags, deadline);
#else
return NULL;
#endif
@@ -154,6 +169,10 @@
VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
: Decoder(cfg, deadline) {}
+ VP9Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+ unsigned long deadline) // NOLINT
+ : Decoder(cfg, flag, deadline) {}
+
protected:
virtual vpx_codec_iface_t* CodecInterface() const {
#if CONFIG_VP9_DECODER
@@ -186,8 +205,14 @@
virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
unsigned long deadline) const {
+ return CreateDecoder(cfg, 0, deadline);
+ }
+
+ virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,
+ const vpx_codec_flags_t flags,
+ unsigned long deadline) const { // NOLINT
#if CONFIG_VP9_DECODER
- return new VP9Decoder(cfg, deadline);
+ return new VP9Decoder(cfg, flags, deadline);
#else
return NULL;
#endif
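
The new CreateDecoder overload threads a vpx_codec_flags_t down to vpx_codec_dec_init(). At the public libvpx API level the equivalent is the sketch below (a minimal example assuming a VP8 decoder build; error handling elided):

    #include "vpx/vp8dx.h"
    #include "vpx/vpx_decoder.h"

    /* Initialize a VP8 decoder that accepts per-partition input fragments,
     * which is what the tests request via the new flags parameter. */
    static vpx_codec_err_t init_fragment_decoder(vpx_codec_ctx_t *ctx) {
      vpx_codec_dec_cfg_t cfg = { 0, 0, 0 };  /* threads, w, h */
      return vpx_codec_dec_init(ctx, vpx_codec_vp8_dx(), &cfg,
                                VPX_CODEC_USE_INPUT_FRAGMENTS);
    }
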
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5dd73db..e30ccf9 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -10,6 +10,7 @@
#include <string.h>
#include "test/acm_random.h"
+#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -315,6 +316,8 @@
#endif
}
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
static void TearDownTestCase() {
vpx_free(input_ - 1);
input_ = NULL;
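
ClearSystemState() is added to TearDown() because the SIMD convolve functions under test can leave the x87/MMX register state dirty, corrupting later floating-point math (including inside gtest itself). A hedged sketch of what the helper reduces to on x86 (a no-op on other targets):

    /* Approximation of libvpx_test::ClearSystemState(): issue EMMS so MMX
     * state left behind by assembly under test cannot poison later x87 math. */
    static void clear_system_state(void) {
    #if defined(__i386__) || defined(__x86_64__)
      __asm__ __volatile__("emms");
    #endif
    }
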
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 0ef4f7b..852d90e 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -110,4 +110,12 @@
RunLoop(video, dec_cfg);
}
+void DecoderTest::set_cfg(const vpx_codec_dec_cfg_t &dec_cfg) {
+ memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+}
+
+void DecoderTest::set_flags(const vpx_codec_flags_t flags) {
+ flags_ = flags;
+}
+
} // namespace libvpx_test
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index a757b59..232428e 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -41,7 +41,13 @@
class Decoder {
public:
Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)
- : cfg_(cfg), deadline_(deadline), init_done_(false) {
+ : cfg_(cfg), flags_(0), deadline_(deadline), init_done_(false) {
+ memset(&decoder_, 0, sizeof(decoder_));
+ }
+
+ Decoder(vpx_codec_dec_cfg_t cfg, const vpx_codec_flags_t flag,
+ unsigned long deadline) // NOLINT
+ : cfg_(cfg), flags_(flag), deadline_(deadline), init_done_(false) {
memset(&decoder_, 0, sizeof(decoder_));
}
@@ -97,6 +103,10 @@
bool IsVP8() const;
+  vpx_codec_ctx_t* GetDecoder() {
+ return &decoder_;
+ }
+
protected:
virtual vpx_codec_iface_t* CodecInterface() const = 0;
@@ -104,7 +114,7 @@
if (!init_done_) {
const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,
CodecInterface(),
- &cfg_, 0);
+ &cfg_, flags_);
ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();
init_done_ = true;
}
@@ -112,6 +122,7 @@
vpx_codec_ctx_t decoder_;
vpx_codec_dec_cfg_t cfg_;
+ vpx_codec_flags_t flags_;
unsigned int deadline_;
bool init_done_;
};
@@ -124,6 +135,9 @@
virtual void RunLoop(CompressedVideoSource *video,
const vpx_codec_dec_cfg_t &dec_cfg);
+ virtual void set_cfg(const vpx_codec_dec_cfg_t &dec_cfg);
+ virtual void set_flags(const vpx_codec_flags_t flags);
+
// Hook to be called before decompressing every frame.
virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
Decoder* /*decoder*/) {}
@@ -146,11 +160,16 @@
const vpx_codec_err_t res_peek);
protected:
- explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}
+ explicit DecoderTest(const CodecFactory *codec)
+ : codec_(codec),
+ cfg_(),
+ flags_(0) {}
virtual ~DecoderTest() {}
const CodecFactory *codec_;
+ vpx_codec_dec_cfg_t cfg_;
+ vpx_codec_flags_t flags_;
};
} // namespace libvpx_test
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 5b6757c..b49e8cb 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -140,6 +140,8 @@
}
void EncoderTest::RunLoop(VideoSource *video) {
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
+
stats_.Reset();
ASSERT_TRUE(passes_ == 1 || passes_ == 2);
@@ -157,7 +159,13 @@
Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,
&stats_);
ASSERT_TRUE(encoder != NULL);
- Decoder* const decoder = codec_->CreateDecoder(dec_cfg_, 0);
+
+ unsigned long dec_init_flags = 0; // NOLINT
+ // Use fragment decoder if encoder outputs partitions.
+ // NOTE: fragment decoder and partition encoder are only supported by VP8.
+ if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
+ dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
+ Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
bool again;
for (again = true, video->Begin(); again; video->Next()) {
again = (video->img() != NULL);
@@ -199,6 +207,13 @@
}
}
+ // Flush the decoder when there are no more fragments.
+ if ((init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
+ const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+ if (!HandleDecodeResult(res_dec, *video, decoder))
+ break;
+ }
+
if (has_dxdata && has_cxdata) {
const vpx_image_t *img_enc = encoder->GetPreviewFrame();
DxDataIterator dec_iter = decoder->GetDxData();
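
The loop above now follows the fragments contract end to end: when the encoder was opened with VPX_CODEC_USE_OUTPUT_PARTITION, each partition is handed to a fragment-enabled decoder, and a trailing DecodeFrame(NULL, 0) marks the end of the frame. The raw-API equivalent looks roughly like this (illustrative helper, not libvpx code):

    #include "vpx/vpx_decoder.h"

    /* Feed one frame's partitions as fragments, then signal end-of-frame
     * with a NULL/0 call so the decoder can finish decoding the frame. */
    static vpx_codec_err_t decode_fragments(vpx_codec_ctx_t *ctx,
                                            const uint8_t *buf[],
                                            const unsigned int sz[],
                                            int num_parts) {
      int i;
      for (i = 0; i < num_parts; ++i) {
        const vpx_codec_err_t res =
            vpx_codec_decode(ctx, buf[i], sz[i], NULL, 0);
        if (res != VPX_CODEC_OK) return res;
      }
      return vpx_codec_decode(ctx, NULL, 0, NULL, 0);  /* end of fragments */
    }
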
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 3bc7847..770ac7e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -185,6 +185,11 @@
// Map the TestMode enum to the deadline_ and passes_ variables.
void SetMode(TestMode mode);
+ // Set encoder flag.
+ void set_init_flags(unsigned long flag) { // NOLINT(runtime/int)
+ init_flags_ = flag;
+ }
+
// Main loop
virtual void RunLoop(VideoSource *video);
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index d6a3473..5357a8d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -474,14 +474,17 @@
::testing::Values(
make_tuple(&vp9_fdct4x4_c,
&vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
- DISABLED_NEON, Trans4x4HT,
+ NEON, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
-#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
!CONFIG_EMULATE_HARDWARE
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 110c9c3..6f2c89b 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -62,6 +62,10 @@
using libvpx_test::ACMRandom;
namespace {
+
+const int kSignBiasMaxDiff255 = 1500;
+const int kSignBiasMaxDiff15 = 10000;
+
typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
@@ -160,7 +164,7 @@
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
- const int max_diff = 1125;
+ const int max_diff = kSignBiasMaxDiff255;
EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
<< "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
@@ -190,9 +194,9 @@
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
- const int max_diff = 10000;
+ const int max_diff = kSignBiasMaxDiff15;
EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
- << "Error: 4x4 FDCT/FHT has a sign bias > "
+ << "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
<< " count0: " << count_sign_block[j][0]
@@ -646,36 +650,24 @@
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8DCT,
::testing::Values(
+ make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12)));
#else
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8DCT,
+ C, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_HIGHBITDEPTH
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
@@ -691,12 +683,9 @@
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_C, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -706,49 +695,44 @@
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_NEON, FwdTrans8x8DCT,
+ NEON, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
VPX_BITS_8)));
+#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
- DISABLED_NEON, FwdTrans8x8HT,
+ NEON, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
-#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8DCT,
+ SSE2, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
-// returned from Rand16().
-INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
+ make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_highbd_fdct8x8_c,
&idct8x8_64_add_10_sse2, 12, VPX_BITS_10),
make_tuple(&vp9_highbd_fdct8x8_sse2,
@@ -761,12 +745,9 @@
// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSE2, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8)));
-INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
@@ -791,7 +772,7 @@
// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
// returned from Rand16().
INSTANTIATE_TEST_CASE_P(
- DISABLED_SSSE3, FwdTrans8x8DCT,
+ SSSE3, FwdTrans8x8DCT,
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
VPX_BITS_8)));
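
For context, the sign-bias checks that the new kSignBiasMaxDiff constants feed: the test runs the forward transform over many random blocks, tallies how often each of the 64 coefficients comes out negative versus positive, and fails if the imbalance exceeds the bound. A condensed sketch of that logic (names and the fdct signature are assumptions, not the literal test):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Condensed sign-bias check: fdct is the 8x8 forward transform under
     * test; max_diff stands in for kSignBiasMaxDiff255 / kSignBiasMaxDiff15. */
    static void check_sign_bias(void (*fdct)(const int16_t *, int16_t *, int),
                                int num_blocks, int max_diff) {
      int count_sign[64][2] = { { 0 } };
      int16_t in[64], out[64];
      int i, j;
      for (i = 0; i < num_blocks; ++i) {
        for (j = 0; j < 64; ++j)
          in[j] = (int16_t)(rand() % 511 - 255);  /* input range [-255, 255] */
        fdct(in, out, 8);
        for (j = 0; j < 64; ++j) {
          if (out[j] < 0) ++count_sign[j][0];
          else if (out[j] > 0) ++count_sign[j][1];
        }
      }
      for (j = 0; j < 64; ++j)
        assert(abs(count_sign[j][0] - count_sign[j][1]) < max_diff);
    }
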
diff --git a/test/test-data.mk b/test/test-data.mk
index 9c46372..d07ca32 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -17,6 +17,7 @@
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_440.yuv
+LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
diff --git a/test/test.mk b/test/test.mk
index f199e87..6e935de 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -94,6 +94,7 @@
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
LIBVPX_TEST_SRCS-yes += vp8_boolcoder_test.cc
+LIBVPX_TEST_SRCS-yes += vp8_fragments_test.cc
endif
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
@@ -127,6 +128,12 @@
LIBVPX_TEST_SRCS-yes += tile_independence_test.cc
LIBVPX_TEST_SRCS-yes += vp9_boolcoder_test.cc
+# TODO(jbb): Remove this once the TODO(minghai) in vpx_encoder.h is resolved.
+# This test fails because of a mismatch in the size of the vpx_codec_alg_priv
+# between encoder and decoder when this is enabled.
+ifneq ($(CONFIG_SPATIAL_SVC),yes)
+LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc
+endif
endif
LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc
diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc
new file mode 100644
index 0000000..cb0d1a1
--- /dev/null
+++ b/test/vp8_fragments_test.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+class VP8FragmentsTest
+ : public ::libvpx_test::EncoderTest,
+ public ::testing::Test {
+ protected:
+  VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {}
+  virtual ~VP8FragmentsTest() {}
+
+ virtual void SetUp() {
+ const unsigned long init_flags = // NOLINT(runtime/int)
+ VPX_CODEC_USE_OUTPUT_PARTITION;
+ InitializeConfig();
+ SetMode(::libvpx_test::kRealTime);
+ set_init_flags(init_flags);
+ }
+};
+
+TEST_F(VP8FragmentsTest, TestFragmentsEncodeDecode) {
+ ::libvpx_test::RandomVideoSource video;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+} // namespace
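
On the encoder side of this test, VPX_CODEC_USE_OUTPUT_PARTITION (VP8 only) makes each frame come back as several VPX_CODEC_CX_FRAME_PKT packets, one per partition, identified by partition_id and flagged with VPX_FRAME_IS_FRAGMENT. A minimal consumer sketch:

    #include "vpx/vpx_encoder.h"

    /* Drain partitioned output: with VPX_CODEC_USE_OUTPUT_PARTITION each
     * frame packet carries one partition of the coded frame. */
    static void drain_partitions(vpx_codec_ctx_t *enc) {
      vpx_codec_iter_t iter = NULL;
      const vpx_codec_cx_pkt_t *pkt;
      while ((pkt = vpx_codec_get_cx_data(enc, &iter)) != NULL) {
        if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;
        /* pkt->data.frame.buf / .sz hold one partition;
         * pkt->data.frame.partition_id says which one. */
      }
    }
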
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
new file mode 100644
index 0000000..6c354fd
--- /dev/null
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/decoder/vp9_decoder.h"
+
+typedef vpx_codec_stream_info_t vp9_stream_info_t;
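+// NOTE: This struct mirrors the decoder's private vpx_codec_alg_priv in
+// vp9/vp9_dx_iface.c so the test can inspect internal state; it must be kept
+// in sync with that definition (cf. the CONFIG_SPATIAL_SVC note in test/test.mk).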
+struct vpx_codec_alg_priv {
+ vpx_codec_priv_t base;
+ vpx_codec_dec_cfg_t cfg;
+ vp9_stream_info_t si;
+ struct VP9Decoder *pbi;
+ int postproc_cfg_set;
+ vp8_postproc_cfg_t postproc_cfg;
+ vpx_decrypt_cb decrypt_cb;
+ void *decrypt_state;
+ vpx_image_t img;
+ int img_avail;
+ int flushed;
+ int invert_tile_order;
+ int frame_parallel_decode;
+
+ // External frame buffer info to save for VP9 common.
+ void *ext_priv; // Private data associated with the external frame buffers.
+ vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+ vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+};
+
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+ return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
+namespace {
+
+const unsigned int kFramerate = 50;
+const int kCpuUsed = 2;
+
+struct EncodePerfTestVideo {
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = {
+ {"niklas_1280_720_30.yuv", 1280, 720, 600, 10},
+};
+
+struct EncodeParameters {
+ int32_t tile_rows;
+ int32_t tile_cols;
+ int32_t lossless;
+ int32_t error_resilient;
+ int32_t frame_parallel;
+ // TODO(JBB): quantizers / bitrate
+};
+
+const EncodeParameters kVP9EncodeParameterSet[] = {
+ {0, 0, 0, 1, 0},
+ {0, 0, 0, 0, 0},
+ {0, 0, 1, 0, 0},
+ {0, 2, 0, 0, 1},
+ // TODO(JBB): Test profiles (requires more work).
+};
+
+int is_extension_y4m(const char *filename) {
+ const char *dot = strrchr(filename, '.');
+ if (!dot || dot == filename)
+ return 0;
+ else
+ return !strcmp(dot, ".y4m");
+}
+
+class Vp9EncoderParmsGetToDecoder
+ : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<EncodeParameters,
+                                                 EncodePerfTestVideo> {
+ protected:
+ Vp9EncoderParmsGetToDecoder()
+ : EncoderTest(GET_PARAM(0)),
+ encode_parms(GET_PARAM(1)) {
+ }
+
+ virtual ~Vp9EncoderParmsGetToDecoder() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 25;
+ cfg_.g_error_resilient = encode_parms.error_resilient;
+ dec_cfg_.threads = 4;
+ test_video_ = GET_PARAM(2);
+ cfg_.rc_target_bitrate = test_video_.bitrate;
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
+ encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
+ encode_parms.frame_parallel);
+ encoder->Control(VP9E_SET_TILE_ROWS, encode_parms.tile_rows);
+ encoder->Control(VP9E_SET_TILE_COLUMNS, encode_parms.tile_cols);
+ encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+ encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+ encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+ }
+ }
+
+ virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
+ const libvpx_test::VideoSource& video,
+ libvpx_test::Decoder *decoder) {
+ vpx_codec_ctx_t* vp9_decoder = decoder->GetDecoder();
+    vpx_codec_alg_priv_t* priv = get_alg_priv(vp9_decoder);
+
+ VP9Decoder* pbi = priv->pbi;
+ VP9_COMMON* common = &pbi->common;
+
+ if (encode_parms.lossless) {
+ EXPECT_EQ(common->base_qindex, 0);
+ EXPECT_EQ(common->y_dc_delta_q, 0);
+ EXPECT_EQ(common->uv_dc_delta_q, 0);
+ EXPECT_EQ(common->uv_ac_delta_q, 0);
+ EXPECT_EQ(common->tx_mode, ONLY_4X4);
+ }
+ EXPECT_EQ(common->error_resilient_mode, encode_parms.error_resilient);
+ if (encode_parms.error_resilient) {
+ EXPECT_EQ(common->frame_parallel_decoding_mode, 1);
+ EXPECT_EQ(common->use_prev_frame_mvs, 0);
+ } else {
+ EXPECT_EQ(common->frame_parallel_decoding_mode,
+ encode_parms.frame_parallel);
+ }
+
+ EXPECT_EQ(common->log2_tile_cols, encode_parms.tile_cols);
+ EXPECT_EQ(common->log2_tile_rows, encode_parms.tile_rows);
+
+ EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
+ return VPX_CODEC_OK == res_dec;
+ }
+
+ EncodePerfTestVideo test_video_;
+
+ private:
+ EncodeParameters encode_parms;
+};
+
+TEST_P(Vp9EncoderParmsGetToDecoder, BitstreamParms) {
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ libvpx_test::VideoSource *video;
+ if (is_extension_y4m(test_video_.name)) {
+ video = new libvpx_test::Y4mVideoSource(test_video_.name,
+ 0, test_video_.frames);
+ } else {
+ video = new libvpx_test::YUVVideoSource(test_video_.name,
+ VPX_IMG_FMT_I420,
+ test_video_.width,
+ test_video_.height,
+ kFramerate, 1, 0,
+ test_video_.frames);
+ }
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  delete video;
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+ Vp9EncoderParmsGetToDecoder,
+ ::testing::ValuesIn(kVP9EncodeParameterSet),
+ ::testing::ValuesIn(kVP9EncodePerfTestVectors));
+
+} // namespace
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 5aa274d..352ed7b 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -60,7 +60,6 @@
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
- int flushed;
int img_setup;
struct frame_buffers yv12_frame_buffers;
void *user_priv;
@@ -89,7 +88,6 @@
priv->si.sz = sizeof(priv->si);
priv->decrypt_cb = NULL;
priv->decrypt_state = NULL;
- priv->flushed = 0;
if (ctx->config.dec)
{
@@ -307,6 +305,11 @@
return 0;
}
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
+ }
+
if (!ctx->fragments.enabled)
{
ctx->fragments.ptrs[0] = data;
@@ -327,14 +330,11 @@
unsigned int resolution_change = 0;
unsigned int w, h;
- if (data == NULL && data_sz == 0) {
- ctx->flushed = 1;
- return VPX_CODEC_OK;
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
}
- /* Reset flushed when receiving a valid frame */
- ctx->flushed = 0;
-
/* Update the input fragment data */
if(update_fragments(ctx, data, data_sz, &res) <= 0)
return res;
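
The net effect of the two hunks above: in whole-frame mode a vpx_codec_decode(ctx, NULL, 0, ...) call remains a harmless flush, while in fragments mode it now falls through to update_fragments() and serves as the end-of-fragments marker, replacing the old flushed bookkeeping. Condensed, the entry point behaves like this sketch (illustrative, not the literal function):

    #include "vpx/vpx_codec.h"

    /* Condensed control flow of vp8_decode() after this change; the
     * fragments_enabled parameter stands in for ctx->fragments.enabled. */
    static vpx_codec_err_t vp8_decode_sketch(int fragments_enabled,
                                             const uint8_t *data,
                                             unsigned int data_sz) {
      if (!fragments_enabled && data == NULL && data_sz == 0)
        return VPX_CODEC_OK;  /* whole-frame mode: flush is a no-op */
      /* Fragments mode: a NULL/0 call falls through; update_fragments()
       * treats it as "frame complete" and the pending frame is decoded. */
      (void)data;
      (void)data_sz;
      return VPX_CODEC_OK;
    }
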
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e2..0000000
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
- ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
- ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
- ; into d16-d19 registers. This macro will touch q10- q15 registers and use
- ; them as buffer during calculation.
- MACRO
- IDCT4x4_1D
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
- vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
- vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
- vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q10, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
- MEND
-
- ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
- ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
- ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
- ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
- ; q14,q15 registers and use them as buffer during calculation.
- MACRO
- IADST4x4_1D
- vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
- vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
- vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
- vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
- vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
- vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
- vaddw.s16 q15, q15, d19 ; x0 + x3
- vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
- vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
- vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
-
- vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
- vadd.s32 q10, q10, q8
- vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
- vdup.32 q8, r0 ; duplicate sinpi_3_9
- vsub.s32 q11, q11, q9
- vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
-
- vadd.s32 q13, q10, q12 ; s0 = x0 + x3
- vadd.s32 q10, q10, q11 ; x0 + x1
- vadd.s32 q14, q11, q12 ; s1 = x1 + x3
- vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
-
- ; dct_const_round_shift
- vqrshrn.s32 d16, q13, #14
- vqrshrn.s32 d17, q14, #14
- vqrshrn.s32 d18, q15, #14
- vqrshrn.s32 d19, q10, #14
- MEND
-
- ; Generate cosine constants in d6 - d8 for the IDCT
- MACRO
- GENERATE_COSINE_CONSTANTS
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; generate constant vectors
- vdup.16 d0, r0 ; duplicate cospi_8_64
- vdup.16 d1, r3 ; duplicate cospi_16_64
- vdup.16 d2, r12 ; duplicate cospi_24_64
- MEND
-
- ; Generate sine constants in d1 - d4 for the IADST.
- MACRO
- GENERATE_SINE_CONSTANTS
- ; sinpi_1_9 = 5283 = 0x14A3
- mov r0, #0x1400
- add r0, #0xa3
- ; sinpi_2_9 = 9929 = 0x26C9
- mov r3, #0x2600
- add r3, #0xc9
- ; sinpi_4_9 = 15212 = 0x3B6C
- mov r12, #0x3b00
- add r12, #0x6c
-
- ; generate constant vectors
- vdup.16 d3, r0 ; duplicate sinpi_1_9
-
- ; sinpi_3_9 = 13377 = 0x3441
- mov r0, #0x3400
- add r0, #0x41
-
- vdup.16 d4, r3 ; duplicate sinpi_2_9
- vdup.16 d5, r12 ; duplicate sinpi_4_9
- vdup.16 q3, r0 ; duplicate sinpi_3_9
- MEND
-
- ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
- MACRO
- TRANSPOSE4X4
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- vtrn.32 q8, q9
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; transpose the input data
- TRANSPOSE4X4
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IDCT4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IDCT4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
- ; generate constants
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1], r2
- vld1.32 {d27[1]}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_iht4x4_16_add_neon|
-
- END
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 0000000..cd8c358
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
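+// Q14 fixed-point transform constants matching vp9_idct.h:
+// cospi_k_64 = round(16384 * cos(k * pi / 64)); the sinpi_k_9 values carry
+// an extra 2 * sqrt(2) / 3 scale used by the 4-point IADST.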
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
+static INLINE void TRANSPOSE4X4(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16) {
+ *d0s16 = vdup_n_s16(cospi_8_64);
+ *d1s16 = vdup_n_s16(cospi_16_64);
+ *d2s16 = vdup_n_s16(cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16) {
+ *d3s16 = vdup_n_s16(sinpi_1_9);
+ *d4s16 = vdup_n_s16(sinpi_2_9);
+ *q3s16 = vdupq_n_s16(sinpi_3_9);
+ *d5s16 = vdup_n_s16(sinpi_4_9);
+ return;
+}
+
+static INLINE void IDCT4x4_1D(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, *d2s16);
+ q10s32 = vmull_s16(d17s16, *d0s16);
+ q13s32 = vmull_s16(d23s16, *d1s16);
+ q14s32 = vmull_s16(d24s16, *d1s16);
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);
+ *q9s16 = vsubq_s16(q13s16, q14s16);
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+ vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
+static INLINE void IADST4x4_1D(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ q10s32 = vmull_s16(*d3s16, d16s16);
+ q11s32 = vmull_s16(*d4s16, d16s16);
+ q12s32 = vmull_s16(d6s16, d17s16);
+ q13s32 = vmull_s16(*d5s16, d18s16);
+ q14s32 = vmull_s16(*d3s16, d18s16);
+ q15s32 = vmovl_s16(d16s16);
+ q15s32 = vaddw_s16(q15s32, d19s16);
+ q8s32 = vmull_s16(*d4s16, d19s16);
+ q15s32 = vsubw_s16(q15s32, d18s16);
+ q9s32 = vmull_s16(*d5s16, d19s16);
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32(sinpi_3_9);
+ q11s32 = vsubq_s32(q11s32, q9s32);
+ q15s32 = vmulq_s32(q15s32, q8s32);
+
+ q13s32 = vaddq_s32(q10s32, q12s32);
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);
+ q10s32 = vsubq_s32(q10s32, q12s32);
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+ return;
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+      // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+    default:  // unreachable given a valid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
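
Each vqrshrn_n_s32(x, 14) above is the NEON form of dct_const_round_shift() from vp9_idct.h: narrow out of Q14 with round-half-up and saturation. A scalar reference for comparison:

    #include <stdint.h>

    /* Scalar equivalent of vqrshrn_n_s32(x, 14): round-half-up shift out of
     * Q14 (ROUND_POWER_OF_TWO(x, 14), i.e. dct_const_round_shift), then
     * saturate to int16_t as the instruction does. */
    static int16_t dct_const_round_shift_ref(int32_t x) {
      const int32_t rv = (x + (1 << 13)) >> 14;
      if (rv > INT16_MAX) return INT16_MAX;
      if (rv < INT16_MIN) return INT16_MIN;
      return (int16_t)rv;
    }
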
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f566..0000000
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht8x8_64_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Generate IADST constants in r0 - r12 for the IADST.
- MACRO
- GENERATE_IADST_CONSTANTS
- ; generate cospi_2_64 = 16305
- mov r0, #0x3f00
- add r0, #0xb1
-
- ; generate cospi_30_64 = 1606
- mov r1, #0x600
- add r1, #0x46
-
- ; generate cospi_10_64 = 14449
- mov r2, #0x3800
- add r2, #0x71
-
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_18_64 = 10394
- mov r4, #0x2800
- add r4, #0x9a
-
- ; generate cospi_14_64 = 12665
- mov r5, #0x3100
- add r5, #0x79
-
- ; generate cospi_26_64 = 4756
- mov r6, #0x1200
- add r6, #0x94
-
- ; generate cospi_6_64 = 15679
- mov r7, #0x3d00
- add r7, #0x3f
-
- ; generate cospi_8_64 = 15137
- mov r8, #0x3b00
- add r8, #0x21
-
- ; generate cospi_24_64 = 6270
- mov r9, #0x1800
- add r9, #0x7e
-
- ; generate 0
- mov r10, #0
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
- MEND
-
- ; Generate IDCT constants in r3 - r9 for the IDCT.
- MACRO
- GENERATE_IDCT_CONSTANTS
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
- MEND
-
- ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
- ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
- ; will be stored back into q8-q15 registers. This macro will touch q0-q7
- ; registers and use them as buffer during calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
- ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
- ; output will be stored back into q8-q15 registers. This macro will touch
- ; q0 - q7 registers and use them as buffer during calculation.
- MACRO
- IADST8X8_1D
- vdup.16 d14, r0 ; duplicate cospi_2_64
- vdup.16 d15, r1 ; duplicate cospi_30_64
-
- ; cospi_2_64 * x0
- vmull.s16 q1, d30, d14
- vmull.s16 q2, d31, d14
-
- ; cospi_30_64 * x0
- vmull.s16 q3, d30, d15
- vmull.s16 q4, d31, d15
-
- vdup.16 d30, r4 ; duplicate cospi_18_64
- vdup.16 d31, r5 ; duplicate cospi_14_64
-
- ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- vmlal.s16 q1, d16, d15
- vmlal.s16 q2, d17, d15
-
- ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
- vmlsl.s16 q3, d16, d14
- vmlsl.s16 q4, d17, d14
-
- ; cospi_18_64 * x4
- vmull.s16 q5, d22, d30
- vmull.s16 q6, d23, d30
-
- ; cospi_14_64 * x4
- vmull.s16 q7, d22, d31
- vmull.s16 q8, d23, d31
-
- ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- vmlal.s16 q5, d24, d31
- vmlal.s16 q6, d25, d31
-
- ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
- vmlsl.s16 q7, d24, d30
- vmlsl.s16 q8, d25, d30
-
- ; (s0 + s4)
- vadd.s32 q11, q1, q5
- vadd.s32 q12, q2, q6
-
- vdup.16 d0, r2 ; duplicate cospi_10_64
- vdup.16 d1, r3 ; duplicate cospi_22_64
-
- ; (s0 - s4)
- vsub.s32 q1, q1, q5
- vsub.s32 q2, q2, q6
-
- ; x0 = dct_const_round_shift(s0 + s4);
- vqrshrn.s32 d22, q11, #14 ; >> 14
- vqrshrn.s32 d23, q12, #14 ; >> 14
-
- ; (s1 + s5)
- vadd.s32 q12, q3, q7
- vadd.s32 q15, q4, q8
-
- ; (s1 - s5)
- vsub.s32 q3, q3, q7
- vsub.s32 q4, q4, q8
-
- ; x4 = dct_const_round_shift(s0 - s4);
- vqrshrn.s32 d2, q1, #14 ; >> 14
- vqrshrn.s32 d3, q2, #14 ; >> 14
-
- ; x1 = dct_const_round_shift(s1 + s5);
- vqrshrn.s32 d24, q12, #14 ; >> 14
- vqrshrn.s32 d25, q15, #14 ; >> 14
-
- ; x5 = dct_const_round_shift(s1 - s5);
- vqrshrn.s32 d6, q3, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; cospi_10_64 * x2
- vmull.s16 q4, d26, d0
- vmull.s16 q5, d27, d0
-
- ; cospi_22_64 * x2
- vmull.s16 q2, d26, d1
- vmull.s16 q6, d27, d1
-
- vdup.16 d30, r6 ; duplicate cospi_26_64
- vdup.16 d31, r7 ; duplicate cospi_6_64
-
- ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- vmlal.s16 q4, d20, d1
- vmlal.s16 q5, d21, d1
-
- ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- vmlsl.s16 q2, d20, d0
- vmlsl.s16 q6, d21, d0
-
- ; cospi_26_64 * x6
- vmull.s16 q0, d18, d30
- vmull.s16 q13, d19, d30
-
- ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- vmlal.s16 q0, d28, d31
- vmlal.s16 q13, d29, d31
-
- ; cospi_6_64 * x6
- vmull.s16 q10, d18, d31
- vmull.s16 q9, d19, d31
-
- ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- vmlsl.s16 q10, d28, d30
- vmlsl.s16 q9, d29, d30
-
- ; (s3 + s7)
- vadd.s32 q14, q2, q10
- vadd.s32 q15, q6, q9
-
- ; (s3 - s7)
- vsub.s32 q2, q2, q10
- vsub.s32 q6, q6, q9
-
- ; x3 = dct_const_round_shift(s3 + s7);
- vqrshrn.s32 d28, q14, #14 ; >> 14
- vqrshrn.s32 d29, q15, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s3 - s7);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; (s2 + s6)
- vadd.s32 q9, q4, q0
- vadd.s32 q10, q5, q13
-
- ; (s2 - s6)
- vsub.s32 q4, q4, q0
- vsub.s32 q5, q5, q13
-
- vdup.16 d30, r8 ; duplicate cospi_8_64
- vdup.16 d31, r9 ; duplicate cospi_24_64
-
- ; x2 = dct_const_round_shift(s2 + s6);
- vqrshrn.s32 d18, q9, #14 ; >> 14
- vqrshrn.s32 d19, q10, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s2 - s6);
- vqrshrn.s32 d8, q4, #14 ; >> 14
- vqrshrn.s32 d9, q5, #14 ; >> 14
-
- ; cospi_8_64 * x4
- vmull.s16 q5, d2, d30
- vmull.s16 q6, d3, d30
-
- ; cospi_24_64 * x4
- vmull.s16 q7, d2, d31
- vmull.s16 q0, d3, d31
-
- ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- vmlal.s16 q5, d6, d31
- vmlal.s16 q6, d7, d31
-
- ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- vmlsl.s16 q7, d6, d30
- vmlsl.s16 q0, d7, d30
-
- ; cospi_8_64 * x7
- vmull.s16 q1, d4, d30
- vmull.s16 q3, d5, d30
-
- ; cospi_24_64 * x7
- vmull.s16 q10, d4, d31
- vmull.s16 q2, d5, d31
-
- ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- vmlsl.s16 q1, d8, d31
- vmlsl.s16 q3, d9, d31
-
- ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- vmlal.s16 q10, d8, d30
- vmlal.s16 q2, d9, d30
-
- vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
-
- vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
-
- vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
-
- vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
-
- ; (s4 + s6)
- vadd.s32 q14, q5, q1
- vadd.s32 q15, q6, q3
-
- ; (s4 - s6)
- vsub.s32 q5, q5, q1
- vsub.s32 q6, q6, q3
-
- ; x4 = dct_const_round_shift(s4 + s6);
- vqrshrn.s32 d18, q14, #14 ; >> 14
- vqrshrn.s32 d19, q15, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s4 - s6);
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; (s5 + s7)
- vadd.s32 q1, q7, q10
- vadd.s32 q3, q0, q2
-
- ; (s5 - s7))
- vsub.s32 q7, q7, q10
- vsub.s32 q0, q0, q2
-
- ; x5 = dct_const_round_shift(s5 + s7);
- vqrshrn.s32 d28, q1, #14 ; >> 14
- vqrshrn.s32 d29, q3, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s5 - s7);
- vqrshrn.s32 d14, q7, #14 ; >> 14
- vqrshrn.s32 d15, q0, #14 ; >> 14
-
- vdup.16 d30, r12 ; duplicate cospi_16_64
-
- ; cospi_16_64 * x2
- vmull.s16 q2, d22, d30
- vmull.s16 q3, d23, d30
-
- ; cospi_6_64 * x6
- vmull.s16 q13, d22, d30
- vmull.s16 q1, d23, d30
-
- ; cospi_16_64 * x2 + cospi_16_64 * x3;
- vmlal.s16 q2, d24, d30
- vmlal.s16 q3, d25, d30
-
- ; cospi_16_64 * x2 - cospi_16_64 * x3;
- vmlsl.s16 q13, d24, d30
- vmlsl.s16 q1, d25, d30
-
- ; x2 = dct_const_round_shift(s2);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q3, #14 ; >> 14
-
- ;x3 = dct_const_round_shift(s3);
- vqrshrn.s32 d24, q13, #14 ; >> 14
- vqrshrn.s32 d25, q1, #14 ; >> 14
-
- ; cospi_16_64 * x6
- vmull.s16 q13, d10, d30
- vmull.s16 q1, d11, d30
-
- ; cospi_6_64 * x6
- vmull.s16 q11, d10, d30
- vmull.s16 q0, d11, d30
-
- ; cospi_16_64 * x6 + cospi_16_64 * x7;
- vmlal.s16 q13, d14, d30
- vmlal.s16 q1, d15, d30
-
- ; cospi_16_64 * x6 - cospi_16_64 * x7;
- vmlsl.s16 q11, d14, d30
- vmlsl.s16 q0, d15, d30
-
- ; x6 = dct_const_round_shift(s6);
- vqrshrn.s32 d20, q13, #14 ; >> 14
- vqrshrn.s32 d21, q1, #14 ; >> 14
-
- ;x7 = dct_const_round_shift(s7);
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q0, #14 ; >> 14
-
- vdup.16 q5, r10 ; duplicate 0
-
- vsub.s16 q9, q5, q9 ; output[1] = -x4;
- vsub.s16 q11, q5, q2 ; output[3] = -x2;
- vsub.s16 q13, q5, q6 ; output[5] = -x7;
- vsub.s16 q15, q5, q4 ; output[7] = -x1;
- MEND
-
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht8x8_64_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- push {r0-r10}
- vpush {d8-d15}
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; first transform rows
- IDCT8x8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; then transform columns
- IADST8X8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; then transform columns
- IDCT8x8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; then transform columns
- IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
- vpop {d8-d15}
- pop {r0-r10}
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
- bx lr
- ENDP ; |vp9_iht8x8_64_add_neon|
-
- END
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 0000000..03c836d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,623 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
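+// Each constant above is round(cos(k * pi / 64) * (1 << 14)); the vmull/vmlal
+// products below are therefore scaled by 2^14 and are narrowed back to
+// 16 bits with vqrshrn_n_s32(..., 14), i.e. dct_const_round_shift().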
+
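+// In-place 8x8 transpose mirroring the assembly vswp/vtrn sequence: the
+// d-register recombinations exchange the two off-diagonal 4x4 quadrants,
+// then the 32-bit and 16-bit vtrn pairs transpose within each quadrant.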
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
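+// One pass of the 8-point IDCT over eight columns: the odd inputs (rows 1,
+// 7 and 3, 5) use the cospi_28/cospi_4 and cospi_12/cospi_20 butterflies,
+// the even inputs use cospi_16 (rows 0, 4) and cospi_24/cospi_8 (rows 2, 6),
+// and the two halves are recombined into output[0..7].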
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
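+// One pass of the 8-point ADST over eight columns. Stage 1 pairs the inputs
+// with the odd constants cospi_{2,6,...,30}, stage 2 uses cospi_8/cospi_24,
+// the final stage uses cospi_16, and outputs 1, 3, 5 and 7 are negated,
+// matching the C reference iadst8().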
+static INLINE void IADST8X8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16(cospi_2_64);
+ d15s16 = vdup_n_s16(cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_18_64);
+ d31s16 = vdup_n_s16(cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16(cospi_10_64);
+ d1s16 = vdup_n_s16(cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16(cospi_26_64);
+ d31s16 = vdup_n_s16(cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);
+ *q11s16 = vsubq_s16(q5s16, q2s16);
+ *q13s16 = vsubq_s16(q5s16, q6s16);
+ *q15s16 = vsubq_s16(q5s16, q4s16);
+ return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
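+  // tx_type selects the row/column transform pair: 0 = DCT_DCT (falls back
+  // to the C version), 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST.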
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+    default:  // unreachable; tx_type is always 0, 1, 2 or 3
+ assert(0);
+ break;
+ }
+
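+  // ROUND_POWER_OF_TWO(temp_out[j], 5): remove the 2^5 scaling left by the
+  // two transform passes with a rounding shift.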
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {
+ if (i != 0) {
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 284d3a2..cad5750 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -44,7 +44,10 @@
vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
}
+#if CONFIG_VP9_POSTPROC
vp9_free_frame_buffer(&cm->post_proc_buffer);
+ vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#endif
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
@@ -125,7 +128,7 @@
init_frame_bufs(cm);
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
+#if CONFIG_VP9_POSTPROC
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7d7209c..ebb1d1d 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -212,6 +212,12 @@
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
@@ -221,17 +227,10 @@
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
#endif
- int lossless;
-
- int corrupted;
-
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
+ int lossless;
+ int corrupted;
} MACROBLOCKD;
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 43a4fe5..58b2da7 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -968,7 +968,7 @@
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
- // for 32x32 transforms also also.
+ // for 32x32 transforms also.
lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
new file mode 100644
index 0000000..f1bdc1b
--- /dev/null
+++ b/vp9/common/vp9_mfqe.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+#include "./vp9_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// one SSE2 implementation in vp8, so we will consider how to share it
+// between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int block_size, int src_weight) {
+ const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ const int rounding_bit = 1 << (MFQE_PRECISION - 1);
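+  // e.g. with MFQE_PRECISION == 4, a src_weight of 12 gives
+  // dst[c] = (12 * src[c] + 4 * dst[c] + 8) >> 4, a 3:1 blend toward src.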
+ int r, c;
+
+ for (r = 0; r < block_size; r++) {
+ for (c = 0; c < block_size; c++) {
+ dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+ >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
+ filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
+ filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
+ dst_stride, 16, weight);
+ filter_by_weight(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+ filter_by_weight32x32(src + 32, src_stride, dst + 32,
+ dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32, src_stride,
+ dst + dst_stride * 32, dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+ dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+ int yd_stride, const uint8_t *u, const uint8_t *v,
+ int uv_stride, uint8_t *ud, uint8_t *vd,
+ int uvd_stride, BLOCK_SIZE block_size,
+ int weight) {
+ if (block_size == BLOCK_16X16) {
+ filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
+ filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
+ filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+ } else if (block_size == BLOCK_32X32) {
+ filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
+ filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+ } else if (block_size == BLOCK_64X64) {
+ filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+ filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+ }
+}
+
+// TODO(jackychen): Determine whether to replace it with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 8; r++) {
+ memcpy(dst, src, 8);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 16; r++) {
+ memcpy(dst, src, 16);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem16x16(src, src_stride, dst, dst_stride);
+ copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride);
+}
+
+void copy_mem64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem32x32(src, src_stride, dst, dst_stride);
+ copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+ copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+ copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+ uint8_t *vd, int yd_stride, int uvd_stride,
+ BLOCK_SIZE bs) {
+ if (bs == BLOCK_16X16) {
+ copy_mem16x16(y, y_stride, yd, yd_stride);
+ copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ } else if (bs == BLOCK_32X32) {
+ copy_mem32x32(y, y_stride, yd, yd_stride);
+ copy_mem16x16(u, uv_stride, ud, uvd_stride);
+ copy_mem16x16(v, uv_stride, vd, uvd_stride);
+ } else {
+ copy_mem64x64(y, y_stride, yd, yd_stride);
+ copy_mem32x32(u, uv_stride, ud, uvd_stride);
+ copy_mem32x32(v, uv_stride, vd, uvd_stride);
+ }
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int sad, sad_thr, vdiff;
+ uint32_t sse;
+
+ if (bs == BLOCK_16X16) {
+ vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ } else if (bs == BLOCK_32X32) {
+ vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ sad = (vp9_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+ } else /* if (bs == BLOCK_64X64) */ {
+ vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+ }
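+  // The rounding terms and shifts above divide by the block's pixel count
+  // (256, 1024 or 4096), so sad and vdiff are per-pixel averages.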
+
+ if (bs == BLOCK_16X16) {
+ sad_thr = 8;
+ } else if (bs == BLOCK_32X32) {
+ sad_thr = 7;
+ } else { // BLOCK_64X64
+ sad_thr = 6;
+ }
+
+ // TODO(jackychen): More experiments and remove magic numbers.
+  // vdiff > sad * 3 means vdiff should not be too small; otherwise it
+  // might be a lighting change in a smooth area, and applying MFQE to a
+  // lighting change in a smooth area is dangerous.
+ if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
+ // TODO(jackychen): Add weighted average in the calculation.
+  // Currently, the data is copied from the last frame without averaging.
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
+ ud, vd, uvd_stride, bs, 0);
+ } else {
+    // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+}
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in the current block (for an inter frame), or the
+  // motion in the correlated block in the last frame (for a key frame).
+ const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+ mi->mbmi.mv[0].as_mv.row +
+ mi->mbmi.mv[0].as_mv.col *
+ mi->mbmi.mv[0].as_mv.col;
+ const int mv_threshold = 100;
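+  // MVs are in 1/8-pel units, so this permits motion of at most
+  // sqrt(100) / 8 = 1.25 pixels.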
+ return mi->mbmi.mode >= NEARESTMV && // Not an intra block
+ cur_bs >= BLOCK_16X16 &&
+ mv_len_square <= mv_threshold;
+}
+
+// Process each partition in a super block, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+ const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int mi_offset, y_offset, uv_offset;
+ const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+ // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
+ // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int bsl = b_width_log2_lookup[bs];
+ PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+ const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+ if (cur_bs < BLOCK_8X8) {
+    // Blocks smaller than 8x8 can only occur on the frame boundary.
+ return;
+ }
+ // No MFQE on blocks smaller than 16x16
+ if (partition == PARTITION_SPLIT && bs == BLOCK_16X16) {
+ partition = PARTITION_NONE;
+ }
+ switch (partition) {
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+      // The current block size is not square, so copy the block from the
+      // current frame (i.e., no MFQE is done).
+ // TODO(jackychen): Rectangle blocks should also be taken into account.
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ break;
+ case PARTITION_NONE:
+ if (mfqe_decision(mi, cur_bs)) {
+ // Do mfqe on this partition.
+ mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride);
+ } else {
+        // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bs == BLOCK_64X64) {
+ mi_offset = 4;
+ y_offset = 32;
+ uv_offset = 16;
+ } else {
+ mi_offset = 2;
+ y_offset = 16;
+ uv_offset = 8;
+ }
+ // Recursion on four square partitions, e.g. if bs is 64X64,
+ // then look into four 32X32 blocks in it.
+ mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+ v + uv_offset, y_stride, uv_stride, yd + y_offset,
+ ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+ y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+ subsize, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+ int mi_row, mi_col;
+ // Current decoded frame.
+ const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // Last decoded frame, which will store the MFQE result.
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+ // Loop through each super block.
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ MODE_INFO *mi;
+ MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion info in the last frame.
+ MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+ (mi_row * cm->mi_stride + mi_col);
+ const uint32_t y_stride = show->y_stride;
+ const uint32_t uv_stride = show->uv_stride;
+ const uint32_t yd_stride = dest->y_stride;
+ const uint32_t uvd_stride = dest->uv_stride;
+ const uint32_t row_offset_y = mi_row << 3;
+ const uint32_t row_offset_uv = mi_row << 2;
+ const uint32_t col_offset_y = mi_col << 3;
+ const uint32_t col_offset_uv = mi_col << 2;
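+      // One mi unit covers 8 luma pixels, hence << 3 for luma offsets and
+      // << 2 for the half-resolution (4:2:0) chroma planes.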
+ const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+ col_offset_y;
+ const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+ uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ if (frame_is_intra_only(cm)) {
+ mi = mi_prev;
+ } else {
+ mi = mi_local;
+ }
+ mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+ vd, yd_stride, uvd_stride);
+ }
+ }
+}
diff --git a/vp9/common/vp9_mfqe.h b/vp9/common/vp9_mfqe.h
new file mode 100644
index 0000000..dfff8c2
--- /dev/null
+++ b/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim of MFQE is to replace pixel blocks in the current frame with
+// the correlated, higher-quality pixel blocks from the last frame.
+// The replacement is applied only to stationary blocks, selected by
+// checking the motion of the blocks and other conditions such as the
+// SAD between the current block and the correlated block and the
+// variance of the block difference.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_COMMON_VP9_MFQE_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index c012055..c166590 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -112,7 +112,10 @@
int new_fb_idx;
+#if CONFIG_VP9_POSTPROC
YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
@@ -181,7 +184,6 @@
struct segmentation seg;
// Context probabilities for reference frame prediction
- int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
REFERENCE_MODE reference_mode;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 575ffbc..06cb65a 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -79,6 +79,9 @@
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
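+// MFQE runs only when the previous frame was decoded at reasonable quality
+// (qindex <= last_q_thresh) and the current frame is clearly worse (qindex
+// jump >= q_diff_thresh); see the gating test in vp9_post_proc_frame().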
+
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
@@ -616,6 +619,17 @@
}
}
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO *temp = cm->postproc_state.prev_mip;
+ cm->postproc_state.prev_mip = cm->mip;
+ cm->mip = temp;
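+  // The arrays are ping-ponged rather than copied; the next decoded frame
+  // writes its mode info into what was prev_mip.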
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
const int q = MIN(63, cm->lf.filter_level * 10 / 6);
@@ -633,7 +647,42 @@
vp9_clear_system_state();
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
+  // Allocate memory for prev_mip on the first frame.
+ if (cm->current_video_frame == 1) {
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+ ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+ if (!ppstate->prev_mip) {
+ return 1;
+ }
+ ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+ vpx_memset(ppstate->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ }
+
+ // Allocate post_proc_buffer_int if needed.
+ if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+ const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ VP9_ENC_BORDER_IN_PIXELS) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+ }
+
+      // Initialize the buffer to a neutral gray (128) so that postproc
+      // doesn't pull random data in from the edge.
+ vpx_memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+ cm->post_proc_buffer.frame_size);
+ }
+ }
+
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
@@ -642,9 +691,28 @@
VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
-#endif
- if (flags & VP9D_DEMACROBLOCK) {
+ if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+ cm->postproc_state.last_frame_valid &&
+ cm->postproc_state.last_base_qindex <= last_q_thresh &&
+ cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+ vp9_mfqe(cm);
+ // TODO(jackychen): Consider whether enable deblocking by default
+ // if mfqe is enabled. Need to take both the quality and the speed
+ // into consideration.
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ }
+ if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+ deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+ q + (ppflags->deblocking_level - 5) * 10,
+ 1, 0);
+ } else if (flags & VP9D_DEBLOCK) {
+ vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+ } else {
+ vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ }
+ } else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
q + (ppflags->deblocking_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
@@ -653,6 +721,9 @@
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+
if (flags & VP9D_ADDNOISE) {
const int noise_level = ppflags->noise_level;
if (ppstate->last_q != q ||
@@ -673,6 +744,7 @@
dest->uv_width = dest->y_width >> cm->subsampling_x;
dest->uv_height = dest->y_height >> cm->subsampling_y;
+ swap_mi_and_prev_mi(cm);
return 0;
}
-#endif
+#endif // CONFIG_VP9_POSTPROC
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index ebebc1a..035c9cd 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
#include "vp9/common/vp9_ppflags.h"
#ifdef __cplusplus
@@ -24,6 +26,10 @@
int last_q;
int last_noise;
char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ MODE_INFO *prev_mip;
+ MODE_INFO *prev_mi;
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@
struct VP9Common;
+#define MFQE_PRECISION 4
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index 1644a1b..12b989f 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@
VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
VP9D_DEBUG_DRAW_MV = 1 << 7,
VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+ VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+ VP9D_MFQE = 1 << 10
};
typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 575990b..11c0d81 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -457,12 +457,10 @@
specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
- $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+ specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
- $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+ specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 3610c71..42e0baa 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -4260,7 +4260,7 @@
// N.B. Only first 4 cols contain non-zero coeffs
max_input = _mm_max_epi16(inptr[0], inptr[1]);
min_input = _mm_min_epi16(inptr[0], inptr[1]);
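+  // All 8 row registers may hold non-zero coefficients even though only the
+  // first 4 columns do, so scan every register; the old bound of 4 left
+  // inptr[4..7] unchecked.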
- for (i = 2; i < 4; i++) {
+ for (i = 2; i < 8; i++) {
max_input = _mm_max_epi16(max_input, inptr[i]);
min_input = _mm_min_epi16(min_input, inptr[i]);
}
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index 91055b9f..f5f7d5a 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -601,9 +601,6 @@
t80:
times 8 db 0x80
align 16
-t1s:
- times 8 db 0x01
-align 16
t3:
times 8 db 0x03
align 16
@@ -612,15 +609,3 @@
align 16
ones:
times 4 dw 0x0001
-align 16
-s27:
- times 4 dw 0x1b00
-align 16
-s18:
- times 4 dw 0x1200
-align 16
-s9:
- times 4 dw 0x0900
-align 16
-s63:
- times 4 dw 0x003f
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa65..71dbb40 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+ srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+ srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+ srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+ srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+ srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
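+  // Rows 1-7 of the 8-tap window stay in registers across iterations; each
+  // pass below loads only the newest row and rotates the registers down
+  // afterwards.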
for (i = 0; i < output_height; i++) {
- // load the first 8 bytes
- srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
- srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
// merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
- srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
- srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
// merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
@@ -390,9 +400,11 @@
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@
// duplicate only the forth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 16 bytes
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
for (i = 0; i < output_height; i++) {
- // load the first 16 bytes
- srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
- // load the next 16 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
- srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ // load the last 16 bytes
+ srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
// merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- // load the next 16 bytes in stride of two/three src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
// merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
- // load the next 16 bytes in stride of four/five src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
// merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_min_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_min_epi16(srcRegFilt6, srcRegFilt8));
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_max_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_max_epi16(srcRegFilt6, srcRegFilt8));
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save 16 bytes convolve result
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 470b2ba..5bf44d7 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -952,7 +952,6 @@
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
}
}
@@ -1149,7 +1148,6 @@
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1563,7 +1561,6 @@
"Uninitialized entropy context.");
vp9_zero(cm->counts);
- vp9_zero(xd->dqcoeff);
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
diff --git a/vp9/decoder/vp9_read_bit_buffer.c b/vp9/decoder/vp9_read_bit_buffer.c
index 3eef728..c3b38a9 100644
--- a/vp9/decoder/vp9_read_bit_buffer.c
+++ b/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
#include "vp9/decoder/vp9_read_bit_buffer.h"
size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
- return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+ return (rb->bit_offset + 7) >> 3;
}
int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
const size_t off = rb->bit_offset;
- const size_t p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
- if (rb->bit_buffer + p >= rb->bit_buffer_end) {
- rb->error_handler(rb->error_handler_data);
- return 0;
- } else {
- const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+ const size_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
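+  // Bits are packed MSB-first: bit (off & 7) of byte (off >> 3), counted
+  // from the top, sits q places above the LSB.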
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
rb->bit_offset = off + 1;
return bit;
+ } else {
+ rb->error_handler(rb->error_handler_data);
+ return 0;
}
}
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 20368f0..4154a6f 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -1183,7 +1183,7 @@
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
counts->intra_inter[i]);
- if (cm->allow_comp_inter_inter) {
+ if (cpi->allow_comp_inter_inter) {
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 69dedac..86023a5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -278,7 +278,7 @@
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
- int count;
+ int log2_count;
int variance;
} var;
@@ -327,7 +327,6 @@
static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
int i;
node->part_variances = NULL;
- vpx_memset(node->split, 0, sizeof(node->split));
switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
@@ -375,18 +374,18 @@
static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
v->sum_square_error = s2;
v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = (int)(256 *
- (v->sum_square_error - v->sum_error * v->sum_error /
- v->count) / v->count);
- else
- v->variance = 0;
+ v->log2_count = c;
+}
+
+static void get_variance(var *v) {
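+  // variance = 256 * (sum_sq_error - sum_error^2 / count) / count, with both
+  // divisions done as shifts since count == 1 << log2_count.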
+ v->variance = (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
}
void sum_2_variances(const var *a, const var *b, var *r) {
+ assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count, r);
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
}
static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
@@ -424,8 +423,9 @@
if (cm->frame_type == KEY_FRAME) {
bsize_ref = BLOCK_8X8;
- // Choose lower thresholds for key frame variance to favor split.
- threshold_bsize_ref = threshold >> 1;
+ // Choose lower thresholds for key frame variance to favor split, but keep
+ // threshold for splitting to 4x4 block still fairly high for now.
+ threshold_bsize_ref = threshold << 2;
threshold_low = threshold >> 2;
}
@@ -433,6 +433,7 @@
// variance is below threshold, otherwise split will be selected.
// No check for vert/horiz split as too few samples for variance.
if (bsize == bsize_ref) {
+ get_variance(&vt.part_variances->none);
if (mi_col + block_width / 2 < cm->mi_cols &&
mi_row + block_height / 2 < cm->mi_rows &&
vt.part_variances->none.variance < threshold_bsize_ref) {
@@ -441,6 +442,7 @@
}
return 0;
} else if (bsize > bsize_ref) {
+ get_variance(&vt.part_variances->none);
// For key frame, for bsize above 32X32, or very high variance, take split.
if (cm->frame_type == KEY_FRAME &&
(bsize > BLOCK_32X32 ||
@@ -454,24 +456,32 @@
set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
+
// Check vertical split.
- if (mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->vert[0].variance < threshold_low &&
- vt.part_variances->vert[1].variance < threshold_low) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- set_block_size(cpi, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
- return 1;
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold_low &&
+ vt.part_variances->vert[1].variance < threshold_low) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
}
// Check horizontal split.
- if (mi_col + block_width / 2 < cm->mi_cols &&
- vt.part_variances->horz[0].variance < threshold_low &&
- vt.part_variances->horz[1].variance < threshold_low) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- set_block_size(cpi, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
- return 1;
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold_low &&
+ vt.part_variances->horz[1].variance < threshold_low) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
}
+
return 0;
}
return 0;
@@ -573,7 +583,7 @@
// If variance is based on 8x8 downsampling, we stop here and have
// one sample for the 8x8 block (so pass 0 for log2_count to
// fill_variance), which of course means variance = 0 for the 8x8 block.
- fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
} else {
// For key frame, go down to 4x4.
v8x8 *vst2 = &vst->split[k];
@@ -583,7 +593,16 @@
unsigned int sse = 0;
int sum = 0;
if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ int s_avg;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ }
+#else
int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+#endif
// For key frame, reference is set to 128.
sum = s_avg - 128;
sse = sum * sum;
@@ -591,7 +610,7 @@
// If variance is based on 4x4 downsampling, we stop here and have
// one sample for the 4x4 block (so pass 0 for log2_count to
// fill_variance), which of course means variance = 0 for the 4x4 block.
- fill_variance(sse, sum, 1, &vst2->split[m].part_variances.none);
+ fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
}
}
}
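
On key frames the 4x4 leaves are built from a single averaged sample against the fixed reference value 128, so each leaf's variance is zero by construction and only the sums propagate up the tree. A sketch of what the averaging helper computes, assuming a rounded mean (the exact rounding inside vp9_avg_4x4() is an assumption here):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for vp9_avg_4x4(): rounded mean of a 4x4 block. */
    static int avg_4x4_sketch(const uint8_t *s, int stride) {
      int r, c, sum = 0;
      for (r = 0; r < 4; ++r, s += stride)
        for (c = 0; c < 4; ++c)
          sum += s[c];
      return (sum + 8) >> 4;
    }

    int main(void) {
      uint8_t block[16];
      int i, sum;
      for (i = 0; i < 16; ++i)
        block[i] = (uint8_t)(120 + i);
      /* Key frames use the flat reference 128, so the 4x4 leaf stores a
       * single sample: sum = avg - 128, sse = sum * sum, variance = 0. */
      sum = avg_4x4_sketch(block, 4) - 128;
      printf("sum = %d, sse = %d\n", sum, sum * sum);
      return 0;
    }
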
@@ -637,8 +656,6 @@
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
- // TODO(marpan): Allow for setting 4x4 partition on key frame.
- /*
if (cm->frame_type == KEY_FRAME) {
if (!set_vt_partitioning(cpi, xd,
&vt.split[i].split[j].split[k],
@@ -651,12 +668,11 @@
BLOCK_4X4);
}
} else {
- */
set_block_size(cpi, xd,
(mi_row + y32_idx + y16_idx + y8_idx),
(mi_col + x32_idx + x16_idx + x8_idx),
BLOCK_8X8);
- // }
+ }
}
}
}
@@ -3103,7 +3119,7 @@
if (mi_row + hbs < cm->mi_rows) {
pc_tree->horizontal[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
- &this_rdc, subsize, &pc_tree->horizontal[0]);
+ &this_rdc, subsize, &pc_tree->horizontal[1]);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
@@ -3227,7 +3243,7 @@
if (mi_row + hbs < cm->mi_rows) {
pc_tree->horizontal[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
- dummy_cost, subsize, &pc_tree->horizontal[0]);
+ dummy_cost, subsize, &pc_tree->horizontal[1]);
pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
@@ -3707,9 +3723,9 @@
cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
(cm->ref_frame_sign_bias[ALTREF_FRAME] ==
cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
+ cpi->allow_comp_inter_inter = 0;
} else {
- cm->allow_comp_inter_inter = 1;
+ cpi->allow_comp_inter_inter = 1;
cm->comp_fixed_ref = ALTREF_FRAME;
cm->comp_var_ref[0] = LAST_FRAME;
cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3733,7 +3749,7 @@
const int is_alt_ref = frame_type == ALTREF_FRAME;
/* prediction (compound, single or hybrid) mode selection */
- if (is_alt_ref || !cm->allow_comp_inter_inter)
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
cm->reference_mode = SINGLE_REFERENCE;
else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
mode_thrs[COMPOUND_REFERENCE] >
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 14f7c7f..a58a90e 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -339,6 +339,8 @@
unsigned int max_mv_magnitude;
int mv_step_param;
+ int allow_comp_inter_inter;
+
// Default value is 1. From first pass stats, encode_breakout may be disabled.
ENCODE_BREAKOUT_TYPE allow_encode_breakout;
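
allow_comp_inter_inter moves from VP9_COMMON into the encoder-only VP9_COMP struct, since the decoder never reads it. The gate itself, restated as a self-contained sketch with stand-in types (not the libvpx structs): compound inter-inter prediction wants references on opposite temporal sides, so ALTREF must differ in sign bias from both LAST and GOLDEN.

    #include <stdio.h>

    enum { LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME, REFS };  /* stand-in enum */

    static int allow_comp_inter_inter(const int sign_bias[REFS]) {
      /* Mirrors the gate above: ALTREF must point the other way in time
       * from both LAST and GOLDEN for a compound pair to exist. */
      return sign_bias[ALTREF_FRAME] != sign_bias[GOLDEN_FRAME] &&
             sign_bias[ALTREF_FRAME] != sign_bias[LAST_FRAME];
    }

    int main(void) {
      const int bias[REFS] = { 0, 0, 1 };  /* ALTREF from the future */
      printf("compound allowed: %d\n", allow_comp_inter_inter(bias));
      return 0;
    }
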
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index b450324..379d067 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -249,14 +249,14 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
dc_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -265,14 +265,14 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
ac_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -537,9 +537,9 @@
// Reduce the intra cost penalty for small blocks (<=16x16).
const int reduction_fac =
(cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- bsize <= BLOCK_16X16) ? 4 : 1;
+ bsize <= BLOCK_16X16) ? 2 : 0;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
const int8_t segment_id = mbmi->segment_id;
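
The reduction factor is now expressed as a log2 shift: the old "/ 4" for small blocks becomes ">> 2" and the no-op "/ 1" becomes ">> 0". A trivial check with an illustrative penalty value:

    #include <stdio.h>

    int main(void) {
      const int intra_cost_penalty = 1024;  /* illustrative value */
      const int small_block = 1;            /* bsize <= BLOCK_16X16 */
      const int reduction_fac = small_block ? 2 : 0;
      /* ">> 2" divides by 4, ">> 0" is a no-op: same results as the old
       * "/ 4" and "/ 1", without the integer divide. */
      printf("penalty = %d\n", intra_cost_penalty >> reduction_fac);  /* 256 */
      return 0;
    }
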
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 5b49bfc..34d49f0 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -379,7 +379,7 @@
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate,
int64_t *dist) {
// This function models the rate and distortion for a Laplacian
@@ -395,10 +395,10 @@
int d_q10, r_q10;
static const uint32_t MAX_XSQ_Q10 = 245727;
const uint64_t xsq_q10_64 =
- ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
model_rd_norm(xsq_q10, &r_q10, &d_q10);
- *rate = (n * r_q10 + 2) >> 2;
+ *rate = ((r_q10 << n_log2) + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
}
}
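
Since n here is a block's pixel count, it is always a power of two, so passing n_log2 lets the model trade the 64-bit multiply by n for a shift. A quick equivalence check between the old and new forms (all values illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const unsigned int n_log2 = 6;  /* e.g. an 8x8 block: n = 64 pixels */
      const unsigned int n = 1u << n_log2;
      const unsigned int qstep = 37, var = 1234;  /* illustrative values */
      const int r_q10 = 777;

      /* xsq computation: multiply by n == shift by n_log2. */
      const uint64_t old_xsq =
          ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
      const uint64_t new_xsq =
          (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
      assert(old_xsq == new_xsq);

      /* rate computation: n * r_q10 == r_q10 << n_log2. */
      assert(((n * r_q10 + 2) >> 2) ==
             ((((unsigned)r_q10 << n_log2) + 2) >> 2));

      printf("xsq_q10 before clamping = %llu\n", (unsigned long long)new_xsq);
      return 0;
    }
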
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 600a3eb..bc5edc8 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -265,15 +265,15 @@
} else {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> (xd->bd - 5),
&rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
rate_sum += rate;
@@ -3065,7 +3065,7 @@
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
// Skip compound inter modes if ARF is not available.
@@ -3715,7 +3715,7 @@
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index ccd5518..2560459 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -72,6 +72,8 @@
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -133,12 +135,13 @@
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
ifeq ($(HAVE_NEON_ASM), yes)
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 85e32d3..8095140 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -11,6 +11,7 @@
#include <stdlib.h>
#include <string.h>
+#include "./vpx_config.h"
#include "./vpx_version.h"
#include "vpx/internal/vpx_codec_internal.h"
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index 3814ef4..c94b76a 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -395,7 +395,7 @@
; On Android platforms use lrand48 when building postproc routines. Prior to
; Android L, rand() was not available.
-%if CONFIG_POSTPROC=1
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
%ifdef __ANDROID__
extern sym(lrand48)
%define LIBVPX_RAND lrand48
@@ -403,4 +403,4 @@
extern sym(rand)
%define LIBVPX_RAND rand
%endif
-%endif ; CONFIG_POSTPROC
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/vpxdec.c b/vpxdec.c
index c4d2a9e..30e2742 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -131,7 +131,7 @@
#endif
#if CONFIG_LIBYUV
-static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
FilterModeEnum mode) {
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
if (src->fmt == VPX_IMG_FMT_I42016) {
@@ -948,7 +948,7 @@
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
#if CONFIG_LIBYUV
- vpx_image_scale(img, scaled_img, kFilterBox);
+ libyuv_scale(img, scaled_img, kFilterBox);
img = scaled_img;
#else
fprintf(stderr, "Failed to scale output frame: %s.\n"