Merge "vp9_receive_compressed_data: remove unnecessary indent"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f361021..c6c8660 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1060,9 +1060,11 @@
CC=${CC:-icc}
LD=${LD:-icc}
setup_gnu_toolchain
- add_cflags -use-msasm -use-asm
- add_ldflags -i-static
- enabled x86_64 && add_cflags -ipo -static -O3
+ add_cflags -use-msasm # remove -use-msasm too?
+ # add -no-intel-extensions to suppress warning #10237
+ # refer to http://software.intel.com/en-us/forums/topic/280199
+ add_ldflags -i-static -no-intel-extensions
+ enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
enabled x86_64 && AR=xiar
case ${tune_cpu} in
atom*)
diff --git a/examples.mk b/examples.mk
index 7b47ade..88327fe 100644
--- a/examples.mk
+++ b/examples.mk
@@ -40,9 +40,9 @@
vpxenc.SRCS += vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
-vpxenc.SRCS += libmkv/EbmlIDs.h
-vpxenc.SRCS += libmkv/EbmlWriter.c
-vpxenc.SRCS += libmkv/EbmlWriter.h
+vpxenc.SRCS += third_party/libmkv/EbmlIDs.h
+vpxenc.SRCS += third_party/libmkv/EbmlWriter.c
+vpxenc.SRCS += third_party/libmkv/EbmlWriter.h
vpxenc.SRCS += $(LIBYUV_SRCS)
vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
vpxenc.DESCRIPTION = Full featured encoder
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f0b412d..abeb4bd 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -599,6 +599,28 @@
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
+#if HAVE_SSE2
+const ConvolveFunctions convolve8_sse2(
+ vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
+ vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
+ vp9_convolve8_sse2, vp9_convolve8_avg_sse2);
+
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+ make_tuple(4, 4, &convolve8_sse2),
+ make_tuple(8, 4, &convolve8_sse2),
+ make_tuple(4, 8, &convolve8_sse2),
+ make_tuple(8, 8, &convolve8_sse2),
+ make_tuple(16, 8, &convolve8_sse2),
+ make_tuple(8, 16, &convolve8_sse2),
+ make_tuple(16, 16, &convolve8_sse2),
+ make_tuple(32, 16, &convolve8_sse2),
+ make_tuple(16, 32, &convolve8_sse2),
+ make_tuple(32, 32, &convolve8_sse2),
+ make_tuple(64, 32, &convolve8_sse2),
+ make_tuple(32, 64, &convolve8_sse2),
+ make_tuple(64, 64, &convolve8_sse2)));
+#endif
+
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 0d19aa0..3d61d40 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -21,7 +21,7 @@
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
}
#include "vpx/vpx_integer.h"
@@ -258,9 +258,10 @@
}
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+ int tx_type);
void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fdct16x16_c(in, out, stride);
@@ -496,27 +497,27 @@
INSTANTIATE_TEST_CASE_P(
C, Trans16x16DCT,
::testing::Values(
- make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0)));
+ make_tuple(&vp9_short_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
::testing::Values(
- make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0),
- make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1),
- make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2),
- make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3)));
+ make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
+ make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
+ make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
+ make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_short_fdct16x16_sse2,
- &vp9_short_idct16x16_add_sse2, 0)));
+ &vp9_idct16x16_256_add_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
- make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0),
- make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1),
- make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2),
- make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3)));
+ make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
+ make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
+ make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
+ make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
#endif
} // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f331886..f456abc 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -75,7 +75,7 @@
}
typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *dst, int stride);
class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
public:
@@ -247,16 +247,16 @@
INSTANTIATE_TEST_CASE_P(
C, Trans32x32Test,
::testing::Values(
- make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
- make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
+ make_tuple(&vp9_short_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
+ make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_short_fdct32x32_sse2,
- &vp9_short_idct32x32_add_sse2, 0),
+ &vp9_idct32x32_1024_add_sse2, 0),
make_tuple(&vp9_short_fdct32x32_rd_sse2,
- &vp9_short_idct32x32_add_sse2, 1)));
+ &vp9_idct32x32_1024_add_sse2, 1)));
#endif
} // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index ea40ca6..edc194d 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -31,7 +31,7 @@
}
void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int /*tx_type*/) {
- vp9_short_idct4x4_add_c(out, dst, stride >> 1);
+ vp9_idct4x4_16_add_c(out, dst, stride >> 1);
}
void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
int stride, int tx_type) {
@@ -39,7 +39,7 @@
}
void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
- vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type);
+ vp9_iht4x4_16_add_c(out, dst, stride >> 1, tx_type);
}
class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 7edb4d0..728db6d 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -21,7 +21,7 @@
extern "C" {
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
}
#include "vpx/vpx_integer.h"
@@ -29,9 +29,10 @@
namespace {
typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride);
-typedef void (*idct_t)(int16_t *in, uint8_t *dst, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *dst, int stride);
typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type);
-typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *dst, int stride,
+ int tx_type);
void fdct8x8_ref(int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fdct8x8_c(in, out, stride);
@@ -296,26 +297,26 @@
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_short_fdct8x8_c, &vp9_short_idct8x8_add_c, 0)));
+ make_tuple(&vp9_short_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
- make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 0),
- make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 1),
- make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 2),
- make_tuple(&vp9_short_fht8x8_c, &vp9_short_iht8x8_add_c, 3)));
+ make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
+ make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
+ make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
+ make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_short_fdct8x8_sse2, &vp9_short_idct8x8_add_sse2, 0)));
+ make_tuple(&vp9_short_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
- make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 0),
- make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 1),
- make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 2),
- make_tuple(&vp9_short_fht8x8_sse2, &vp9_short_iht8x8_add_sse2, 3)));
+ make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
+ make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
+ make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
+ make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
#endif
} // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index fc8129e..d8c61ff 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -126,7 +126,7 @@
reference_dct_2d(input, output_r);
for (int j = 0; j < 64; ++j)
coeff[j] = round(output_r[j]);
- vp9_short_idct8x8_add_c(coeff, dst, 8);
+ vp9_idct8x8_64_add_c(coeff, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
diff --git a/test/resize_test.cc b/test/resize_test.cc
index d194dfd..e8c2c82 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -208,7 +208,7 @@
virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
if (!frame0_psnr_)
frame0_psnr_ = pkt->data.psnr.psnr[0];
- EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);
+ EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.5);
}
virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
diff --git a/libmkv/EbmlBufferWriter.c b/third_party/libmkv/EbmlBufferWriter.c
similarity index 100%
rename from libmkv/EbmlBufferWriter.c
rename to third_party/libmkv/EbmlBufferWriter.c
diff --git a/libmkv/EbmlBufferWriter.h b/third_party/libmkv/EbmlBufferWriter.h
similarity index 100%
rename from libmkv/EbmlBufferWriter.h
rename to third_party/libmkv/EbmlBufferWriter.h
diff --git a/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h
similarity index 100%
rename from libmkv/EbmlIDs.h
rename to third_party/libmkv/EbmlIDs.h
diff --git a/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c
similarity index 100%
rename from libmkv/EbmlWriter.c
rename to third_party/libmkv/EbmlWriter.c
diff --git a/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h
similarity index 100%
rename from libmkv/EbmlWriter.h
rename to third_party/libmkv/EbmlWriter.h
diff --git a/libmkv/Makefile b/third_party/libmkv/Makefile
similarity index 100%
rename from libmkv/Makefile
rename to third_party/libmkv/Makefile
diff --git a/libmkv/WebMElement.c b/third_party/libmkv/WebMElement.c
similarity index 100%
rename from libmkv/WebMElement.c
rename to third_party/libmkv/WebMElement.c
diff --git a/libmkv/WebMElement.h b/third_party/libmkv/WebMElement.h
similarity index 100%
rename from libmkv/WebMElement.h
rename to third_party/libmkv/WebMElement.h
diff --git a/libmkv/testlibmkv.c b/third_party/libmkv/testlibmkv.c
similarity index 100%
rename from libmkv/testlibmkv.c
rename to third_party/libmkv/testlibmkv.c
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index fb7b5cd..0b9fc09 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,31 +11,31 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
-extern void vp9_short_idct16x16_10_add_neon_pass1(int16_t *input,
- int16_t *output,
- int output_stride);
-extern void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
- int16_t *output,
- int16_t *pass1Output,
- int16_t skip_adding,
- uint8_t *dest,
- int dest_stride);
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+ int16_t *output,
+ int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+ int16_t *output,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride);
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
-void vp9_short_idct16x16_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
@@ -46,12 +46,12 @@
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+1,
+ vp9_idct16x16_256_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -61,12 +61,12 @@
/* Parallel idct on the lower 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
row_idct_output+8,
pass1_output,
0,
@@ -76,12 +76,12 @@
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -91,12 +91,12 @@
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
@@ -109,8 +109,8 @@
return;
}
-void vp9_short_idct16x16_10_add_neon(int16_t *input,
- uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+ uint8_t *dest, int dest_stride) {
int64_t store_reg[8];
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
@@ -121,12 +121,12 @@
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
+ vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7
// which will be saved into row_idct_output.
- vp9_short_idct16x16_10_add_neon_pass2(input+1,
+ vp9_idct16x16_10_add_neon_pass2(input+1,
row_idct_output,
pass1_output,
0,
@@ -138,12 +138,12 @@
/* Parallel idct on the left 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
row_idct_output,
pass1_output,
1,
@@ -153,12 +153,12 @@
/* Parallel idct on the right 8 columns */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
// stage 6 result in pass1_output.
- vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+ vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
// Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
// with result in pass1(pass1_output) to calculate final result in stage 7.
// Then add the result to the destination data.
- vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+ vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
row_idct_output+8,
pass1_output,
1,
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
index cf5c8f7..b1fd21b 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct16x16_1_add_neon|
+ EXPORT |vp9_idct16x16_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct16x16_1_add_neon| PROC
+|vp9_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -193,6 +193,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct16x16_1_add_neon|
+ ENDP ; |vp9_idct16x16_1_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index df2a052..a13c0d0 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -8,10 +8,10 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct16x16_add_neon_pass1|
- EXPORT |vp9_short_idct16x16_add_neon_pass2|
- EXPORT |vp9_short_idct16x16_10_add_neon_pass1|
- EXPORT |vp9_short_idct16x16_10_add_neon_pass2|
+ EXPORT |vp9_idct16x16_256_add_neon_pass1|
+ EXPORT |vp9_idct16x16_256_add_neon_pass2|
+ EXPORT |vp9_idct16x16_10_add_neon_pass1|
+ EXPORT |vp9_idct16x16_10_add_neon_pass2|
ARM
REQUIRE8
PRESERVE8
@@ -36,7 +36,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_256_add_neon_pass1|(const int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -46,7 +46,7 @@
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass1| PROC
+|vp9_idct16x16_256_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -273,9 +273,9 @@
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass1|
-;void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -292,7 +292,7 @@
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_add_neon_pass2| PROC
+|vp9_idct16x16_256_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -784,9 +784,9 @@
end_idct16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct16x16_add_neon_pass2|
+ ENDP ; |vp9_idct16x16_256_add_neon_pass2|
-;void |vp9_short_idct16x16_10_add_neon_pass1|(int16_t *input,
+;void |vp9_idct16x16_10_add_neon_pass1|(const int16_t *input,
; int16_t *output, int output_stride)
;
; r0 int16_t input
@@ -796,7 +796,7 @@
; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_10_add_neon_pass1| PROC
+|vp9_idct16x16_10_add_neon_pass1| PROC
; TODO(hkuang): Find a better way to load the elements.
; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
@@ -905,9 +905,9 @@
vst1.64 {d31}, [r1], r2
bx lr
- ENDP ; |vp9_short_idct16x16_10_add_neon_pass1|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass1|
-;void vp9_short_idct16x16_10_add_neon_pass2(int16_t *src,
+;void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
; int16_t *output,
; int16_t *pass1Output,
; int16_t skip_adding,
@@ -924,7 +924,7 @@
; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
; will be stored back into q8-q15 registers. This function will touch q0-q7
; registers and use them as buffer during calculation.
-|vp9_short_idct16x16_10_add_neon_pass2| PROC
+|vp9_idct16x16_10_add_neon_pass2| PROC
push {r3-r9}
; TODO(hkuang): Find a better way to load the elements.
@@ -1175,5 +1175,5 @@
end_idct10_16x16_pass2
pop {r3-r9}
bx lr
- ENDP ; |vp9_short_idct16x16_10_add_neon_pass2|
+ ENDP ; |vp9_idct16x16_10_add_neon_pass2|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index b5a284b..f00d027 100644
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -43,7 +43,7 @@
cospi_31_64 EQU 804
- EXPORT |vp9_short_idct32x32_add_neon|
+ EXPORT |vp9_idct32x32_1024_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -288,7 +288,7 @@
MEND
; --------------------------------------------------------------------------
-;void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
+;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
; r0 int16_t *input,
; r1 uint8_t *dest,
@@ -303,7 +303,7 @@
; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...)
; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)
-|vp9_short_idct32x32_add_neon| PROC
+|vp9_idct32x32_1024_add_neon| PROC
; This function does one pass of idct32x32 transform.
;
; This is done by transposing the input and then doing a 1d transform on
@@ -1295,5 +1295,5 @@
vpop {d8-d15}
pop {r4-r11}
bx lr
- ENDP ; |vp9_short_idct32x32_add_neon|
+ ENDP ; |vp9_idct32x32_1024_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
index 869ee5f..0d4a721 100644
--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct4x4_1_add_neon|
+ EXPORT |vp9_idct4x4_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_1_add_neon| PROC
+|vp9_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -63,6 +63,6 @@
vst1.32 {d7[1]}, [r12]
bx lr
- ENDP ; |vp9_short_idct4x4_1_add_neon|
+ ENDP ; |vp9_idct4x4_1_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
index 640fb93..00283fc 100644
--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct4x4_add_neon|
+ EXPORT |vp9_idct4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -16,13 +16,13 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct4x4_add_neon| PROC
+|vp9_idct4x4_16_add_neon| PROC
; The 2D transform is done with two passes which are actually pretty
; similar. We first transform the rows. This is done by transposing
@@ -185,6 +185,6 @@
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_idct4x4_add_neon|
+ ENDP ; |vp9_idct4x4_16_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
index 923804f..421d202 100644
--- a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -8,21 +8,21 @@
;
- EXPORT |vp9_short_idct8x8_1_add_neon|
+ EXPORT |vp9_idct8x8_1_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_1_add_neon| PROC
+|vp9_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
; generate cospi_16_64 = 11585
@@ -83,6 +83,6 @@
vst1.64 {d31}, [r12], r2
bx lr
- ENDP ; |vp9_short_idct8x8_1_add_neon|
+ ENDP ; |vp9_idct8x8_1_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index c02251a..5476400 100644
--- a/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -8,8 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_idct8x8_add_neon|
- EXPORT |vp9_short_idct8x8_10_add_neon|
+ EXPORT |vp9_idct8x8_64_add_neon|
+ EXPORT |vp9_idct8x8_10_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -198,13 +198,13 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_add_neon| PROC
+|vp9_idct8x8_64_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -308,15 +308,15 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct8x8_add_neon|
+ ENDP ; |vp9_idct8x8_64_add_neon|
-;void vp9_short_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
+;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride)
;
; r0 int16_t input
; r1 uint8_t *dest
; r2 int dest_stride)
-|vp9_short_idct8x8_10_add_neon| PROC
+|vp9_idct8x8_10_add_neon| PROC
push {r4-r9}
vpush {d8-d15}
vld1.s16 {q8,q9}, [r0]!
@@ -514,6 +514,6 @@
vpop {d8-d15}
pop {r4-r9}
bx lr
- ENDP ; |vp9_short_idct8x8_10_add_neon|
+ ENDP ; |vp9_idct8x8_10_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
index 963ef35..2f326e2 100644
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht4x4_add_neon|
+ EXPORT |vp9_iht4x4_16_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -139,7 +139,7 @@
MEND
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -147,7 +147,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
+|vp9_iht4x4_16_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -175,7 +175,7 @@
; then transform columns
IADST4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
idct_iadst
; generate constants
@@ -191,7 +191,7 @@
; then transform columns
IDCT4x4_1D
- b end_vp9_short_iht4x4_add_neon
+ b end_vp9_iht4x4_16_add_neon
iadst_iadst
; generate constants
@@ -206,7 +206,7 @@
; then transform columns
IADST4x4_1D
-end_vp9_short_iht4x4_add_neon
+end_vp9_iht4x4_16_add_neon
; ROUND_POWER_OF_TWO(temp_out[j], 4)
vrshr.s16 q8, q8, #4
vrshr.s16 q9, q9, #4
@@ -232,6 +232,6 @@
vst1.32 {d26[1]}, [r1], r2
vst1.32 {d26[0]}, [r1] ; no post-increment
bx lr
- ENDP ; |vp9_short_iht4x4_add_neon|
+ ENDP ; |vp9_iht4x4_16_add_neon|
END
diff --git a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
index bab9cb4..93d3af3 100644
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -8,7 +8,7 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_short_iht8x8_add_neon|
+ EXPORT |vp9_iht8x8_64_add_neon|
ARM
REQUIRE8
PRESERVE8
@@ -559,7 +559,7 @@
AREA Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
; int dest_stride, int tx_type)
;
; r0 int16_t input
@@ -567,7 +567,7 @@
; r2 int dest_stride
; r3 int tx_type)
; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
+|vp9_iht8x8_64_add_neon| PROC
; load the inputs into d16-d19
vld1.s16 {q8,q9}, [r0]!
@@ -602,7 +602,7 @@
; then transform columns
IADST8X8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
idct_iadst
; generate IADST constants
@@ -620,7 +620,7 @@
; then transform columns
IDCT8x8_1D
- b end_vp9_short_iht8x8_add_neon
+ b end_vp9_iht8x8_64_add_neon
iadst_iadst
; generate IADST constants
@@ -635,7 +635,7 @@
; then transform columns
IADST8X8_1D
-end_vp9_short_iht8x8_add_neon
+end_vp9_iht8x8_64_add_neon
pop {r0-r10}
; ROUND_POWER_OF_TWO(temp_out[j], 5)
@@ -691,6 +691,6 @@
vst1.64 {d6}, [r0], r2
vst1.64 {d7}, [r0], r2
bx lr
- ENDP ; |vp9_short_iht8x8_add_neon|
+ ENDP ; |vp9_iht8x8_64_add_neon|
END
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index d2fa4c1..dc88f16 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -81,5 +81,34 @@
);
}
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter,
+ int w, int h);
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
#endif // #if HAVE_DSPR2
#endif // VP9_COMMON_VP9_COMMON_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
new file mode 100644
index 0000000..91d62bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_dspr2.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride, /* added to src after each row */
+ uint8_t *dst, /* read for averaging, then overwritten */
+ int32_t dst_stride, /* added to dst after each row */
+ const int16_t *filter_y, /* only elements [3] and [4] are read */
+ int32_t w, /* columns; 4 are produced per inner iteration */
+ int32_t h) { /* rows */
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl; /* byte table indexed by each extracted sum */
+ uint32_t vector4a = 64; /* preloaded into the LO half of every accumulator */
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+ /* Each output is a 2-tap sum of a pixel and the pixel one row below it, averaged into dst. */
+ filter45 = ((const int32_t *)filter)[0];
+ /* NOTE(review): 32-bit load at element 3 of an int16_t array - confirm alignment is OK on target. */
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ /* load 64 into LO and clear HI of $ac0-$ac3 */
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 4 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 3 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ : "memory"); /* the asm reads src and reads/writes dst memory that no operand describes */
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride, /* added to src after each row */
+ uint8_t *dst, /* read for averaging, then overwritten */
+ int32_t dst_stride, /* added to dst after each row */
+ const int16_t *filter_y, /* only elements [3] and [4] are read */
+ int32_t h) { /* rows; the width is fixed at 64 columns */
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl; /* byte table indexed by each extracted sum */
+ uint32_t vector4a = 64; /* preloaded into the LO half of every accumulator */
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+ /* Each output is a 2-tap sum of a pixel and the pixel one row below it, averaged into dst. */
+ filter45 = ((const int32_t *)filter)[0];
+ /* NOTE(review): 32-bit load at element 3 of an int16_t array - confirm alignment is OK on target. */
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+ vp9_prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ /* load 64 into LO and clear HI of $ac0-$ac3 */
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 4 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 3 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
+ "extp %[Temp2], $ac3, 31 \n\t"
+ "lbu %[scratch1], 2(%[dst_ptr]) \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+ "lbu %[scratch2], 3(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
+ "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride), [cm] "r" (cm),
+ [dst_ptr] "r" (dst_ptr)
+ : "memory"); /* the asm reads src and reads/writes dst memory that no operand describes */
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (16 == y_step_q4) { /* any other step takes the C path in the else branch */
+ uint32_t pos = 38;
+ /* Dispatches by width to a DSPr2 row kernel; filter_x and x_step_q4 matter only to the C fallback. */
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ vp9_prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32: /* these widths share the kernel that takes w as an argument */
+ convolve_bi_avg_vert_4_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, w, h);
+ break;
+ case 64: /* dedicated kernel with the width fixed at 64 */
+ vp9_prefetch_store(dst + 32);
+ convolve_bi_avg_vert_64_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_y, h);
+ break;
+ default: /* any other width: generic C implementation */
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ vp9_convolve8_avg_vert_c(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
+ }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
new file mode 100644
index 0000000..148b20f
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride, /* added to src after each row */
+ uint8_t *dst, /* read for averaging, then overwritten */
+ int32_t dst_stride, /* added to dst after each row */
+ const int16_t *filter_x0, /* only elements [3] and [4] are read */
+ int32_t h) { /* rows; 4 pixels are written per row */
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* byte table indexed by each extracted sum */
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64; /* preloaded into the LO half of each accumulator */
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3;
+ uint32_t tn1, tn2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* Each output is a 2-tap sum of a pixel and its right-hand neighbour, averaged into dst. */
+ filter45 = ((const int32_t *)filter)[0];
+ /* NOTE(review): 32-bit load at element 3 of an int16_t array - confirm alignment is OK on target. */
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
+ "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
+ "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */
+
+ /* clamp */
+ "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
+ "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
+ "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */
+
+ "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
+ "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */
+
+ "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
+ "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ : "memory"); /* the asm reads src and reads/writes dst memory that no operand describes */
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride, /* added to src after each row */
+ uint8_t *dst, /* read for averaging, then overwritten */
+ int32_t dst_stride, /* added to dst after each row */
+ const int16_t *filter_x0, /* only elements [3] and [4] are read */
+ int32_t h) { /* rows; 8 pixels are written per row */
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* byte table indexed by each extracted sum */
+ uint32_t vector4a = 64; /* preloaded into the LO half of each accumulator */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* Each output is a 2-tap sum of a pixel and its right-hand neighbour, averaged into dst. */
+ filter45 = ((const int32_t *)filter)[0];
+ /* NOTE(review): 32-bit load at element 3 of an int16_t array - confirm alignment is OK on target. */
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+ "lbu %[Temp2], 0(%[dst]) \n\t"
+ "lbu %[tp4], 2(%[dst]) \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
+ "sb %[Temp2], 0(%[dst]) \n\t"
+ "sb %[tp4], 2(%[dst]) \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "lbu %[Temp2], 4(%[dst]) \n\t"
+ "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[Temp2], 4(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp1], 6(%[dst]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ "lbu %[tp2], 1(%[dst]) \n\t"
+ "lbu %[tp3], 3(%[dst]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "lbu %[tp4], 5(%[dst]) \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp2], 1(%[dst]) \n\t"
+ "sb %[tp1], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbu %[tp1], 7(%[dst]) \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"
+
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"
+
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"
+
+ /* store bytes */
+ "sb %[tp3], 3(%[dst]) \n\t"
+ "sb %[tp4], 5(%[dst]) \n\t"
+ "sb %[tp1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ : "memory"); /* the asm reads src and reads/writes dst memory that no operand describes */
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, /* added to src_ptr after each row */
+ uint8_t *dst_ptr, /* read for averaging, then overwritten */
+ int32_t dst_stride, /* added to dst_ptr after each row */
+ const int16_t *filter_x0, /* only elements [3] and [4] are read */
+ int32_t h, /* rows */
+ int32_t count) { /* 16-pixel groups processed per row */
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl; /* byte table indexed by each extracted sum */
+ uint32_t vector_64 = 64; /* preloaded into the LO half of each accumulator */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* Each output is a 2-tap sum of a pixel and its right-hand neighbour, averaged into dst. */
+ filter45 = ((const int32_t *)filter)[0];
+ /* NOTE(review): 32-bit load at element 3 of an int16_t array - confirm alignment is OK on target. */
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ : "memory"); /* the asm reads src and reads/writes dst memory that no operand describes */
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+ "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
+ "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
+ "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
+ "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
+ "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
+ "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
+ "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
+ "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
+ "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
+ "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
+ "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */
+
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */
+
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+ "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */
+
+ "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
+ "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
+ "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [qload3] "=&r" (qload3), [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ );
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+/* DSPr2 horizontal 2-tap convolution whose result is averaged into dst.
+ * Assembly kernels exist only for x_step_q4 == 16 and w in
+ * {4, 8, 16, 32, 64}; every other call is forwarded unchanged to
+ * vp9_convolve8_avg_horiz_c(). */
+void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  /* bit position for extract from acc */
+  const uint32_t pos = 38;
+
+  /* A step other than 16 is not handled by the kernels below. */
+  if (16 != x_step_q4) {
+    vp9_convolve8_avg_horiz_c(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+    return;
+  }
+
+  /* Write pos to the DSP control register before any kernel runs. */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+  vp9_prefetch_store(dst);
+
+  /* Pick the kernel written for this block width. */
+  if (4 == w) {
+    convolve_bi_avg_horiz_4_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, h);
+  } else if (8 == w) {
+    convolve_bi_avg_horiz_8_dspr2(src, src_stride,
+                                  dst, dst_stride,
+                                  filter_x, h);
+  } else if (16 == w || 32 == w) {
+    /* The last argument is the pass count: 1 for w == 16, 2 for w == 32. */
+    convolve_bi_avg_horiz_16_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h, w / 16);
+  } else if (64 == w) {
+    /* Wider rows: prefetch one more source and destination chunk. */
+    vp9_prefetch_load(src + 64);
+    vp9_prefetch_store(dst + 32);
+
+    convolve_bi_avg_horiz_64_dspr2(src, src_stride,
+                                   dst, dst_stride,
+                                   filter_x, h);
+  } else {
+    /* No assembly kernel for this width. */
+    vp9_convolve8_avg_horiz_c(src, src_stride,
+                              dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4,
+                              w, h);
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
new file mode 100644
index 0000000..bc422bc
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* lookup table indexed by the "clamp" loads */
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64; /* loaded into each accumulator before its dpa */
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* 2-tap filter, 4 outputs per source row; row y of src fills column y of dst. */
+ filter45 = ((const int32_t *)filter)[0]; /* taps 3 and 4 in one word */
+ /* NOTE(review): the taps are read through an int32_t* - confirm alignment. */
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch the next source row into the cache */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t" /* src[0..3] */
+ "ulw %[tp2], 4(%[src]) \n\t" /* src[4..7] */
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac3, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t" /* presumably realigns tp2 for the odd pixels */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* clamp even 1 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp1], $ac3, 31 \n\t" /* odd 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t" /* clamp even 2 */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 2 */
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t" /* odd 1 */
+ "lbux %[p2], %[Temp2](%[cm]) \n\t" /* odd 2 */
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t" /* even 1 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" /* next dst row */
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t" /* odd 1 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t" /* even 2 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t" /* odd 2 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [dst_ptr] "+r" (dst_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
+ ); /* NOTE(review): stores via sb but declares no "memory"/$ac clobbers */
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1; /* next output column */
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* lookup table indexed by the "clamp" loads */
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64; /* loaded into each accumulator before its dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* 2-tap filter, 8 outputs per source row; row y of src fills column y of dst. */
+ filter45 = ((const int32_t *)filter)[0]; /* taps 3 and 4 in one word */
+ /* NOTE(review): the taps are read through an int32_t* - confirm alignment. */
+ for (y = h; y--;) {
+ /* prefetch the next source row into the cache */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst; /* even outputs start at dst row 0 */
+ odd_dst = (dst_ptr + dst_stride); /* odd outputs start at dst row 1 */
+
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t" /* src[0..3] */
+ "ulw %[tp2], 4(%[src]) \n\t" /* src[4..7] */
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t" /* src[8..11] */
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac3, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "extp %[Temp3], $ac2, 31 \n\t" /* even 2 */
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t" /* clamp even 1 */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t" /* presumably realigns tp3 for the odd pixels */
+ "balign %[tp2], %[tp1], 3 \n\t" /* presumably realigns tp2 for the odd pixels */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* even 3 */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t" /* clamp even 2 */
+ "extp %[p3], $ac1, 31 \n\t" /* even 3, kept in p3 */
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t" /* store even 1 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t" /* store even 2 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp3], $ac2, 31 \n\t" /* even 4 */
+
+ "lbux %[Temp1], %[p3](%[cm]) \n\t" /* clamp even 3 */
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t" /* store even 3 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp2], $ac3, 31 \n\t" /* odd 1 */
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t" /* clamp even 4 */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t" /* store even 4 */
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t" /* odd 2 */
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t" /* clamp odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac3, 31 \n\t" /* odd 3 */
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t" /* store odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp1], $ac2, 31 \n\t" /* odd 4 */
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t" /* odd 2 */
+ "lbux %[p2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t" /* odd 4 */
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t" /* odd 4 */
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ ); /* NOTE(review): stores via sb but declares no "memory"/$ac clobbers */
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1; /* next output column */
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl; /* lookup table indexed by the lbux loads */
+ uint32_t vector_64 = 64; /* loaded into each accumulator before its dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* 2-tap filter, count * 16 outputs per source row, stored down one dst column. */
+ filter45 = ((const int32_t *)filter)[0]; /* taps 3 and 4 in one word */
+ /* NOTE(review): the taps are read through an int32_t* - confirm alignment. */
+ for (y = h; y--;) {
+ /* prefetch the next source row into the cache */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr; /* even outputs start at dst row 0 */
+
+ odd_dst = (dst + dst_stride); /* odd outputs start at dst row 1 */
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t" /* src[0..3] */
+ "ulw %[qload2], 4(%[src]) \n\t" /* src[4..7] */
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t" /* src[8..11] */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t" /* src[12..15] */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t" /* NOTE(review): never feeds a dpa below */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t" /* NOTE(review): p5 is overwritten before its next use */
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t" /* src[1..4] */
+ "ulw %[qload2], 5(%[src]) \n\t" /* src[5..8] */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t" /* src[9..12] */
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t" /* src[13..16] */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t" /* NOTE(review): never feeds a dpa below */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t" /* NOTE(review): p5 is not read after this */
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ ); /* NOTE(review): stores via sb but declares no "memory"/$ac clobbers */
+
+ src += 16; /* next 16 source pixels */
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); /* 16 dst rows further down */
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1; /* next output column */
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl; /* lookup table indexed by the lbux loads */
+ uint32_t vector_64 = 64; /* loaded into each accumulator before its dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1); /* step between same-parity outputs */
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+ /* 2-tap filter, 4 * 16 outputs per source row, stored down one dst column. */
+ filter45 = ((const int32_t *)filter)[0]; /* taps 3 and 4 in one word */
+ /* NOTE(review): the taps are read through an int32_t* - confirm alignment. */
+ for (y = h; y--;) {
+ /* prefetch the next source row into the cache */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr; /* even outputs start at dst row 0 */
+
+ odd_dst = (dst + dst_stride); /* odd outputs start at dst row 1 */
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t" /* src[0..3] */
+ "ulw %[qload2], 4(%[src]) \n\t" /* src[4..7] */
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload1], 8(%[src]) \n\t" /* src[8..11] */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p5], %[qload1] \n\t"
+ "ulw %[qload2], 12(%[src]) \n\t" /* src[12..15] */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 20(%[src]) \n\t" /* NOTE(review): never feeds a dpa below */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t" /* NOTE(review): p5 is overwritten before its next use */
+ "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t" /* src[1..4] */
+ "ulw %[qload2], 5(%[src]) \n\t" /* src[5..8] */
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload2], 9(%[src]) \n\t" /* src[9..12] */
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload2] \n\t"
+ "preceu.ph.qbl %[p5], %[qload2] \n\t"
+ "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t" /* src[13..16] */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "ulw %[qload1], 21(%[src]) \n\t" /* NOTE(review): never feeds a dpa below */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload1] \n\t" /* NOTE(review): p5 is not read after this */
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm),
+ [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
+ ); /* NOTE(review): stores via sb but declares no "memory"/$ac clobbers */
+
+ src += 16; /* next 16 source pixels */
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); /* 16 dst rows further down */
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1; /* next output column */
+ }
+}
+
+/* Plain C 2-tap horizontal filter; source row y is written to dst column y. */
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter, int w, int h) {
+  int row, col;
+
+  for (row = 0; row < h; ++row) {
+    const uint8_t *const in = src + row * src_stride;
+    uint8_t *const out = dst + row;
+
+    for (col = 0; col < w; ++col) {
+      /* only taps 3 and 4 are applied */
+      const int acc = in[col] * filter[3] + in[col + 1] * filter[4];
+
+      /* rounding and clamping are done by the project macro and helper */
+      out[col * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
+    }
+  }
+}
+
+/* 2-tap horizontal filter with transposed output; picks a kernel by width. */
+void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter, int w, int h) {
+  uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+
+  switch (w) {
+    case 4:
+      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
+                                           dst, dst_stride,
+                                           filter, h);
+      break;
+    case 8:
+      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
+                                           dst, dst_stride,
+                                           filter, h);
+      break;
+    case 16:
+    case 32:  /* the 16-wide kernel makes w / 16 passes per row */
+      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
+                                            dst, dst_stride,
+                                            filter, h, w / 16);
+      break;
+    case 64:
+      /* src and src + 32 were prefetched above; fetch the next chunk too */
+      vp9_prefetch_load(src + 64);
+      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
+                                            dst, dst_stride,
+                                            filter, h);
+      break;
+    default:  /* no DSPr2 kernel for this width: use the C loop */
+      convolve_bi_horiz_transposed(src, src_stride,
+                                   dst, dst_stride,
+                                   filter, w, h);
+      break;
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
new file mode 100644
index 0000000..1debdb4
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) { /* 2-tap horizontal filter: 4 output pixels per row, h rows */
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to clamp each result */
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64; /* loaded into every accumulator (mtlo) before dpa */
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+ /* extp reads at DSPControl pos, which vp9_convolve2_horiz_dspr2 sets to 38 */
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel (stored to dst[0]) */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel (stored to dst[2]) */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t" /* source word shifted by one byte, used for the odd pixels */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel (stored to dst[1]) */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel (stored to dst[3]) */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ ); /* NOTE(review): no clobber list although dst memory and $ac2/$ac3 are written - confirm */
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) { /* 2-tap horizontal filter: 8 output pixels per row, h rows */
+ int32_t y;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to clamp each result */
+ uint32_t vector4a = 64; /* loaded into every accumulator (mtlo) before dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src + src_stride);
+ vp9_prefetch_load(src + src_stride + 32);
+ vp9_prefetch_store(dst + dst_stride);
+ /* extp reads at DSPControl pos, which vp9_convolve2_horiz_dspr2 sets to 38 */
+ __asm__ __volatile__ (
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel (stored to dst[0]) */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel (stored to dst[2]) */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel (stored to dst[4]) */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel (stored to dst[6]) */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+ /* shift the source words by one byte for the odd pixels */
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel (stored to dst[1]) */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel (stored to dst[3]) */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel (stored to dst[5]) */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel (stored to dst[7]) */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [st0] "=&r" (st0), [st1] "=&r" (st1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ ); /* NOTE(review): no clobber list although dst memory and $ac1-$ac3 are written - confirm */
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h,
+ int32_t count) { /* 2-tap horizontal filter: count groups of 16 pixels per row, h rows */
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to clamp each result */
+ uint32_t vector_64 = 64; /* loaded into every accumulator (mtlo) before dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ /* each pass of the loop below produces 16 output pixels: dst[0..15] */
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ ); /* NOTE(review): no clobber list although dst memory and $ac1-$ac3 are written - confirm */
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0,
+ int32_t h) { /* 2-tap horizontal filter: 64 output pixels per row, h rows */
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to clamp each result */
+ uint32_t vector_64 = 64; /* loaded into every accumulator (mtlo) before dpa */
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ vp9_prefetch_load(src_ptr + src_stride);
+ vp9_prefetch_load(src_ptr + src_stride + 32);
+ vp9_prefetch_load(src_ptr + src_stride + 64);
+ vp9_prefetch_store(dst_ptr + dst_stride);
+ vp9_prefetch_store(dst_ptr + dst_stride + 32);
+ /* four passes of 16 output pixels each cover the 64-pixel row */
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__ (
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 2 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 2 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 2 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
+ [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
+ [p5] "=&r" (p5),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
+ : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
+ [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
+ ); /* NOTE(review): no clobber list although dst memory and $ac1-$ac3 are written - confirm */
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
+                               const int16_t *filter_x, int x_step_q4,
+                               const int16_t *filter_y, int y_step_q4,
+                               int w, int h) {
+  /* Entry point of the 2-tap horizontal convolve.  The DSPr2 kernels are
+   * used only for x_step_q4 == 16 and w in {4, 8, 16, 32, 64}; every other
+   * call is forwarded with unchanged arguments to vp9_convolve8_horiz_c. */
+  const int32_t in_stride = (int32_t)src_stride;
+  const int32_t out_stride = (int32_t)dst_stride;
+  const int32_t rows = (int32_t)h;
+  uint32_t pos = 38;
+
+  if (x_step_q4 != 16) {
+    vp9_convolve8_horiz_c(src, src_stride,
+                          dst, dst_stride,
+                          filter_x, x_step_q4,
+                          filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  vp9_prefetch_load((const uint8_t *)filter_x);
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  /* prefetch data to cache memory */
+  vp9_prefetch_load(src);
+  vp9_prefetch_load(src + 32);
+  vp9_prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+      convolve_bi_horiz_4_dspr2(src, in_stride, dst, out_stride,
+                                filter_x, rows);
+      break;
+    case 8:
+      convolve_bi_horiz_8_dspr2(src, in_stride, dst, out_stride,
+                                filter_x, rows);
+      break;
+    case 16:
+    case 32:
+      /* last argument: 1 for w == 16, 2 for w == 32 */
+      convolve_bi_horiz_16_dspr2(src, in_stride, dst, out_stride,
+                                 filter_x, rows, w / 16);
+      break;
+    case 64:
+      /* extra prefetches for the 64-wide case */
+      vp9_prefetch_load(src + 64);
+      vp9_prefetch_store(dst + 32);
+      convolve_bi_horiz_64_dspr2(src, in_stride, dst, out_stride,
+                                 filter_x, rows);
+      break;
+    default:
+      /* width without a DSPr2 kernel */
+      vp9_convolve8_horiz_c(src, src_stride,
+                            dst, dst_stride,
+                            filter_x, x_step_q4,
+                            filter_y, y_step_q4,
+                            w, h);
+      break;
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
new file mode 100644
index 0000000..8eb105c
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t w,
+ int32_t h) { /* 2-tap vertical filter over w columns (4 per asm pass), h rows */
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to produce each stored byte */
+ uint32_t vector4a = 64; /* loaded into every accumulator (mtlo) before dpa */
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+ /* extp reads at DSPControl pos, which vp9_convolve2_vert_dspr2 sets to 38 */
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t" /* 4 pixels of the current row */
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t" /* same 4 columns, next row */
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 4 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 3 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ ); /* NOTE(review): no clobber list although dst memory and $ac0-$ac3 are written - confirm */
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ const int16_t *filter_y,
+ int32_t h) { /* 2-tap vertical filter over 64 columns (4 per asm pass), h rows */
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = vp9_ff_cropTbl; /* table read via lbux to produce each stored byte */
+ uint32_t vector4a = 64; /* loaded into every accumulator (mtlo) before dpa */
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3]; /* only taps 3 and 4 are read */
+ uint32_t filter45;
+ /* taps 3 and 4 as one 32-bit word: the packed-halfword operand of dpa.w.ph */
+ filter45 = ((const int32_t *)filter)[0]; /* NOTE(review): 32-bit read via int16_t pointer - confirm alignment */
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ vp9_prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+ /* extp reads at DSPControl pos, which vp9_convolve2_vert_dspr2 sets to 38 */
+ __asm__ __volatile__ (
+ "ulw %[load1], 0(%[src_ptr]) \n\t" /* 4 pixels of the current row */
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t" /* same 4 columns, next row */
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 4 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 3 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [scratch1] "=&r" (scratch1),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [store1] "=&r" (store1), [store2] "=&r" (store2),
+ [src_ptr] "+r" (src_ptr)
+ : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
+ [src_stride] "r" (src_stride),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ ); /* NOTE(review): no clobber list although dst memory and $ac0-$ac3 are written - confirm */
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  /* Entry point of the 2-tap vertical convolve.  The DSPr2 kernels are used
+   * only for y_step_q4 == 16 and w in {4, 8, 16, 32, 64}; every other call
+   * is forwarded with unchanged arguments to vp9_convolve8_vert_c. */
+  uint32_t pos = 38;
+
+  if (y_step_q4 != 16) {
+    vp9_convolve8_vert_c(src, src_stride,
+                         dst, dst_stride,
+                         filter_x, x_step_q4,
+                         filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp %[pos], 1 \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  vp9_prefetch_store(dst);
+
+  switch (w) {
+    case 4:
+    case 8:
+    case 16:
+    case 32:
+      convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride,
+                               filter_y, w, h);
+      break;
+    case 64:
+      vp9_prefetch_store(dst + 32);
+      convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride,
+                                filter_y, h);
+      break;
+    default:
+      vp9_convolve8_vert_c(src, src_stride,
+                           dst, dst_stride,
+                           filter_x, x_step_q4,
+                           filter_y, y_step_q4, w, h);
+      break;
+  }
+}
+#endif
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index 0930ad1..da7f0fd 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -355,6 +355,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_avg_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == y_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
index 37c665b..69da1cf 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
@@ -965,6 +965,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_avg_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == x_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 2c48bd0..126e05a 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -930,6 +930,21 @@
}
}
+void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           int w, int h) {
+  /* Transposing copy: src row r, column c lands at dst[c * dst_stride + r]. */
+  int row, col;
+
+  for (row = 0; row < h; ++row) {
+    const uint8_t *const in = src + row * src_stride;
+    uint8_t *const out = dst + row;
+
+    for (col = 0; col < w; ++col)
+      out[col * dst_stride] = in[col];
+  }
+}
+
void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -966,20 +981,14 @@
/* copy the src to dst */
if (filter_x[3] == 0x80) {
- int32_t y;
- int32_t c;
- const uint8_t *src_ptr = src - src_stride * 3;
- uint8_t *dst_ptr = temp;
-
- for (y = intermediate_height; y--;) {
- for (c = 0; c < w; c++) {
- dst_ptr[c * intermediate_height] = src_ptr[c];
- }
-
- /* next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
+ copy_horiz_transposed(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ w, intermediate_height);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
+ temp, intermediate_height,
+ filter_x,
+ w, intermediate_height);
} else {
src -= (src_stride * 3 + 3);
@@ -1021,20 +1030,14 @@
/* copy the src to dst */
if (filter_y[3] == 0x80) {
- int32_t y;
- int32_t c;
- uint8_t *src_ptr = temp + 3;
- uint8_t *dst_ptr = dst;
-
- for (y = w; y--;) {
- for (c = 0; c < h; c++) {
- dst_ptr[c * dst_stride] = src_ptr[c];
- }
-
- /* next row... */
- src_ptr += intermediate_height;
- dst_ptr += 1;
- }
+ copy_horiz_transposed(temp + 3, intermediate_height,
+ dst, dst_stride,
+ h, w);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_dspr2(temp + 3, intermediate_height,
+ dst, dst_stride,
+ filter_y,
+ h, w);
} else {
switch (h) {
case 4:
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
index 743d641..0303896 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
@@ -849,6 +849,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_x)[0] == 0) {
+ vp9_convolve2_horiz_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == x_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
index bdc7930..0930bb3 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_vert_dspr2.c
@@ -341,6 +341,12 @@
filter_x, x_step_q4,
filter_y, y_step_q4,
w, h);
+ } else if (((const int32_t *)filter_y)[0] == 0) {
+ vp9_convolve2_vert_dspr2(src, src_stride,
+ dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4,
+ w, h);
} else {
if (16 == y_step_q4) {
uint32_t pos = 38;
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index f0c653f..0f50f37 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -179,6 +179,7 @@
}
void vp9_initialize_common() {
+ vp9_init_neighbors();
vp9_coef_tree_initialize();
vp9_entropy_mode_init();
vp9_entropy_mv_init();
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f116c06..0538b37 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -221,9 +221,7 @@
int lossless;
/* Inverse transform function pointers. */
- void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
- void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
- void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
struct subpix_fn_table subpix;
@@ -578,7 +576,7 @@
}
}
-static int get_tx_eob(struct segmentation *seg, int segment_id,
+static int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
const int eob_max = 16 << (tx_size << 1);
return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 1705402..a2d864c 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -7,13 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_convolve.h"
#include <assert.h>
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 72ea72e..8ebe0e5 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -52,222 +52,7 @@
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
- 0, 4, 1, 5,
- 8, 2, 12, 9,
- 3, 6, 13, 10,
- 7, 14, 11, 15,
-};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
- 0, 4, 8, 1,
- 12, 5, 9, 2,
- 13, 6, 10, 3,
- 7, 14, 11, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
- 0, 1, 4, 2,
- 5, 3, 6, 8,
- 9, 7, 12, 10,
- 13, 11, 14, 15,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
- 0, 8, 1, 16, 9, 2, 17, 24,
- 10, 3, 18, 25, 32, 11, 4, 26,
- 33, 19, 40, 12, 34, 27, 5, 41,
- 20, 48, 13, 35, 42, 28, 21, 6,
- 49, 56, 36, 43, 29, 7, 14, 50,
- 57, 44, 22, 37, 15, 51, 58, 30,
- 45, 23, 52, 59, 38, 31, 60, 53,
- 46, 39, 61, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
- 0, 8, 16, 1, 24, 9, 32, 17,
- 2, 40, 25, 10, 33, 18, 48, 3,
- 26, 41, 11, 56, 19, 34, 4, 49,
- 27, 42, 12, 35, 20, 57, 50, 28,
- 5, 43, 13, 36, 58, 51, 21, 44,
- 6, 29, 59, 37, 14, 52, 22, 7,
- 45, 60, 30, 15, 38, 53, 23, 46,
- 31, 61, 39, 54, 47, 62, 55, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
- 0, 1, 2, 8, 9, 3, 16, 10,
- 4, 17, 11, 24, 5, 18, 25, 12,
- 19, 26, 32, 6, 13, 20, 33, 27,
- 7, 34, 40, 21, 28, 41, 14, 35,
- 48, 42, 29, 36, 49, 22, 43, 15,
- 56, 37, 50, 44, 30, 57, 23, 51,
- 58, 45, 38, 52, 31, 59, 53, 46,
- 60, 39, 61, 47, 54, 55, 62, 63,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
- 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
- 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
- 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
- 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
- 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
- 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
- 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
- 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
- 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
- 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
- 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
- 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
- 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
- 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
- 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
- 251,
- 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
- 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
- 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
- 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
- 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
- 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
- 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
- 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
- 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
- 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
- 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
- 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
- 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
- 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
- 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
- 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
- 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
- 236,
- 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
- 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
- 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
- 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
- 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
- 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
- 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
- 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
- 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
- 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
- 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
- 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
- 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
- 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
- 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
- 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
- 158,
- 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
- 175,
- 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
- 255,
-};
-
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
- 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
- 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
- 68, 131, 37, 100,
- 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
- 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
- 102, 352, 8, 197,
- 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
- 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
- 41, 417, 199, 136,
- 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
- 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
- 295, 420, 106, 451,
- 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
- 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
- 453, 139, 44, 234,
- 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
- 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
- 486, 77, 204, 362,
- 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
- 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
- 111, 238, 48, 143,
- 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
- 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
- 393, 300, 269, 176, 145,
- 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
- 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
- 550, 519, 488, 457, 426, 395,
- 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
- 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
- 210, 179, 117, 86, 55, 738, 707,
- 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
- 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
- 645, 552, 521, 428, 397, 304,
- 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
- 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
- 864, 833, 802, 771, 740, 709,
- 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
- 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
- 710, 679, 617, 586, 555, 493,
- 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
- 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
- 743, 619, 495, 371, 247, 123,
- 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
- 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
- 898, 836, 805, 774, 712, 681,
- 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
- 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
- 651, 620, 589, 558, 527,
- 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
- 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
- 559, 497, 466, 435, 373,
- 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
- 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
- 499, 375, 251, 127,
- 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
- 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
- 685, 654, 592, 561,
- 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
- 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
- 438, 407, 376, 345,
- 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
- 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
- 967, 874, 843, 750,
- 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
- 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
- 564, 533, 440, 409,
- 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
- 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
- 752, 721, 690, 659,
- 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
- 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
- 350, 319, 1002, 971,
- 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
- 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
- 537, 444, 413, 972,
- 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
- 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
- 570, 539, 508, 477,
- 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
- 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
- 1007, 883, 759, 635, 511,
- 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
- 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
- 884, 853, 822, 791,
- 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
- 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
- 1011, 887, 763, 639,
- 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
- 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
- 702, 671, 1013, 982,
- 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
- 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
- 1016, 985, 954, 923,
- 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
- 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
- 990, 959, 1022, 991, 1023,
-};
/* Array indices are identical to previously-existing CONTEXT_NODE indices */
@@ -513,134 +298,7 @@
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-static int find_in_scan(const int16_t *scan, int l, int idx) {
- int n, l2 = l * l;
- for (n = 0; n < l2; n++) {
- int rc = scan[n];
- if (rc == idx)
- return n;
- }
- assert(0);
- return -1;
-}
-static void init_scan_neighbors(const int16_t *scan,
- int16_t *iscan,
- int l, int16_t *neighbors) {
- int l2 = l * l;
- int n, i, j;
-
- // dc doesn't use this type of prediction
- neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
- neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
- iscan[0] = find_in_scan(scan, l, 0);
- for (n = 1; n < l2; n++) {
- int rc = scan[n];
- iscan[n] = find_in_scan(scan, l, n);
- i = rc / l;
- j = rc % l;
- if (i > 0 && j > 0) {
- // col/row scan is used for adst/dct, and generally means that
- // energy decreases to zero much faster in the dimension in
- // which ADST is used compared to the direction in which DCT
- // is used. Likewise, we find much higher correlation between
- // coefficients within the direction in which DCT is used.
- // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff
- // as a context. If ADST or DCT is used in both directions, we
- // use the combination of the two as a context.
- int a = (i - 1) * l + j;
- int b = i * l + j - 1;
- if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
- scan == vp9_col_scan_16x16) {
- // in the col/row scan cases (as well as left/top edge cases), we set
- // both contexts to the same value, so we can branchlessly do a+b+1>>1
- // which automatically becomes a if a == b
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = a;
- } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
- scan == vp9_row_scan_16x16) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- } else {
- neighbors[MAX_NEIGHBORS * n + 0] = a;
- neighbors[MAX_NEIGHBORS * n + 1] = b;
- }
- } else if (i > 0) {
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j;
- } else {
- assert(j > 0);
- neighbors[MAX_NEIGHBORS * n + 0] =
- neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1;
- }
- assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n);
- }
- // one padding item so we don't have to add branches in code to handle
- // calls to get_coef_context() for the token after the final dc token
- neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
- neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
-}
-
-void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
- vp9_default_scan_4x4_neighbors);
- init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
- vp9_row_scan_4x4_neighbors);
- init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
- vp9_col_scan_4x4_neighbors);
- init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
- vp9_default_scan_8x8_neighbors);
- init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
- vp9_row_scan_8x8_neighbors);
- init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
- vp9_col_scan_8x8_neighbors);
- init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
- vp9_default_scan_16x16_neighbors);
- init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
- vp9_row_scan_16x16_neighbors);
- init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
- vp9_col_scan_16x16_neighbors);
- init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
- vp9_default_scan_32x32_neighbors);
-}
-
void vp9_coef_tree_initialize() {
- vp9_init_neighbors();
init_bit_trees();
vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
}
@@ -657,10 +315,10 @@
static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
- vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+ const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index ef9ea46..02178b5 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -12,9 +12,13 @@
#define VP9_COMMON_VP9_ENTROPY_H_
#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_treecoder.h"
+
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_treecoder.h"
+
+#define DIFF_UPDATE_PROB 252
/* Coefficient token alphabet */
@@ -36,6 +40,9 @@
#define INTER_MODE_CONTEXTS 7
+extern DECLARE_ALIGNED(16, const uint8_t,
+ vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+
extern const vp9_tree_index vp9_coef_tree[];
#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */
@@ -44,7 +51,7 @@
extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
typedef struct {
- vp9_tree_p tree;
+ vp9_tree_index *tree;
const vp9_prob *prob;
int len;
int base_val;
@@ -96,64 +103,8 @@
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-#define MAX_NEIGHBORS 2
-
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-void vp9_coef_tree_initialize(void);
+void vp9_coef_tree_initialize(void); /* (void), not (): in C an empty list declares no prototype */
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
@@ -183,13 +134,6 @@
? (COEF_BANDS-1) : band_translate[coef_index];
}
-static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
- return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
- token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
-}
-
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
@@ -210,126 +154,6 @@
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_4x4;
- case DCT_ADST:
- return vp9_col_scan_4x4;
- default:
- return vp9_default_scan_4x4;
- }
-}
-
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_4x4;
- *nb = vp9_row_scan_4x4_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_4x4;
- *nb = vp9_col_scan_4x4_neighbors;
- break;
- default:
- *scan = vp9_default_scan_4x4;
- *nb = vp9_default_scan_4x4_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_4x4;
- case DCT_ADST:
- return vp9_col_iscan_4x4;
- default:
- return vp9_default_iscan_4x4;
- }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_8x8;
- case DCT_ADST:
- return vp9_col_scan_8x8;
- default:
- return vp9_default_scan_8x8;
- }
-}
-
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_8x8;
- *nb = vp9_row_scan_8x8_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_8x8;
- *nb = vp9_col_scan_8x8_neighbors;
- break;
- default:
- *scan = vp9_default_scan_8x8;
- *nb = vp9_default_scan_8x8_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_8x8;
- case DCT_ADST:
- return vp9_col_iscan_8x8;
- default:
- return vp9_default_iscan_8x8;
- }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_16x16;
- case DCT_ADST:
- return vp9_col_scan_16x16;
- default:
- return vp9_default_scan_16x16;
- }
-}
-
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_16x16;
- *nb = vp9_row_scan_16x16_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_16x16;
- *nb = vp9_col_scan_16x16_neighbors;
- break;
- default:
- *scan = vp9_default_scan_16x16;
- *nb = vp9_default_scan_16x16_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_16x16;
- case DCT_ADST:
- return vp9_col_iscan_16x16;
- default:
- return vp9_default_iscan_16x16;
- }
-}
-
static int get_entropy_context(TX_SIZE tx_size,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
@@ -386,7 +210,4 @@
}
}
-
-enum { VP9_COEF_UPDATE_PROB = 252 };
-
#endif // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index e176796..56e6444 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -226,7 +226,7 @@
};
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-DC_PRED, 2, /* 0 = DC_NODE */
-TM_PRED, 4, /* 1 = TM_NODE */
-V_PRED, 6, /* 2 = V_NODE */
@@ -237,22 +237,20 @@
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-const vp9_tree_index vp9_inter_mode_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
-ZEROMV, 2,
-NEARESTMV, 4,
-NEARMV, -NEWMV
};
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-const vp9_tree_index vp9_partition_tree[6] = {
+const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2,
-PARTITION_HORZ, 4,
-PARTITION_VERT, -PARTITION_SPLIT
};
-
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
@@ -338,7 +336,8 @@
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
}
-const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree
+ [TREE_SIZE(SWITCHABLE_FILTERS)] = {
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ccade27..ab37b75 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,7 +15,6 @@
#include "vp9/common/vp9_treecoder.h"
#define TX_SIZE_CONTEXTS 2
-#define MODE_UPDATE_PROB 252
#define SWITCHABLE_FILTERS 3 // number of switchable filters
// #define MODE_STATS
@@ -38,19 +37,17 @@
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
-extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_inter_mode_tree[];
-
+extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+
+extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
extern const vp9_tree_index vp9_switchable_interp_tree
- [2 * (SWITCHABLE_FILTERS - 1)];
-
+ [TREE_SIZE(SWITCHABLE_FILTERS)];
extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index a9e25b7..e851181 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -18,14 +18,14 @@
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
#define COMPANDED_MVREF_THRESH 8
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-MV_JOINT_ZERO, 2,
-MV_JOINT_HNZVZ, 4,
-MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
};
struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_0, 2,
-MV_CLASS_1, 4,
6, 8,
@@ -39,12 +39,12 @@
};
struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
-0, 2,
-1, 4,
-2, -3
@@ -214,11 +214,11 @@
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
nmv_context *ctx = &cm->fc.nmvc;
- nmv_context *pre_ctx = &pre_fc->nmvc;
- nmv_context_counts *cts = &cm->counts.mv;
+ const nmv_context *pre_ctx = &pre_fc->nmvc;
+ const nmv_context_counts *cts = &cm->counts.mv;
adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 3b782ab..c42653d 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -43,9 +43,6 @@
return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
}
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
/* Symbols for coding magnitude class of nonzero components */
#define MV_CLASSES 11
typedef enum {
@@ -62,9 +59,6 @@
MV_CLASS_10 = 10, /* (1024,2048] integer pel */
} MV_CLASS_TYPE;
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
@@ -77,10 +71,16 @@
#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
#define MV_LOW (-(1 << MV_IN_USE_BITS))
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
+
+extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
+
+extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
extern struct vp9_token vp9_mv_fp_encodings[4];
typedef struct {
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 676b274..36d19a7 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -27,7 +27,7 @@
SWITCHABLE = 4 /* should be the last one */
} INTERPOLATIONFILTERTYPE;
-typedef const int16_t subpel_kernel[SUBPEL_TAPS];
+typedef int16_t subpel_kernel[SUBPEL_TAPS];
struct subpix_fn_table {
const subpel_kernel *filter_x;
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 99d84c9..52b039d 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,13 +18,13 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
int16_t output[16];
int a1, b1, c1, d1, e1;
- int16_t *ip = input;
+ const int16_t *ip = input;
int16_t *op = output;
for (i = 0; i < 4; i++) {
@@ -60,21 +60,21 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
- dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
- dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
- dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+ dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+ dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+ dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+ dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
ip++;
dest++;
}
}
-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
int i;
int a1, e1;
int16_t tmp[4];
- int16_t *ip = in;
+ const int16_t *ip = in;
int16_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -96,7 +96,7 @@
}
}
-void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
+static void idct4_1d(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
// stage 1
@@ -116,7 +116,7 @@
output[3] = step[0] - step[3];
}
-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[4 * 4];
int16_t *outptr = out;
int i, j;
@@ -124,7 +124,7 @@
// Rows
for (i = 0; i < 4; ++i) {
- vp9_idct4_1d(input, outptr);
+ idct4_1d(input, outptr);
input += 4;
outptr += 4;
}
@@ -133,14 +133,14 @@
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- vp9_idct4_1d(temp_in, temp_out);
+ idct4_1d(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
int i;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -156,7 +156,7 @@
}
}
-static void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(const int16_t *input, int16_t *output) {
int16_t step1[8], step2[8];
int temp1, temp2;
// stage 1
@@ -174,7 +174,7 @@
step1[6] = dct_const_round_shift(temp2);
// stage 2 & stage 3 - even half
- vp9_idct4_1d(step1, step1);
+ idct4_1d(step1, step1);
// stage 2 - odd half
step2[4] = step1[4] + step1[5];
@@ -201,7 +201,7 @@
output[7] = step1[0] - step1[7];
}
-void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
int i, j;
@@ -220,12 +220,12 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,11 +234,11 @@
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void iadst4_1d(int16_t *input, int16_t *output) {
+static void iadst4_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[0];
@@ -280,13 +280,13 @@
output[3] = dct_const_round_shift(s3);
}
-void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
const transform_2d IHT_4[] = {
- { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0
- { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1
- { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2
- { iadst4_1d, iadst4_1d } // ADST_ADST = 3
+ { idct4_1d, idct4_1d }, // DCT_DCT = 0
+ { iadst4_1d, idct4_1d }, // ADST_DCT = 1
+ { idct4_1d, iadst4_1d }, // DCT_ADST = 2
+ { iadst4_1d, iadst4_1d } // ADST_ADST = 3
};
int i, j;
@@ -307,11 +307,11 @@
temp_in[j] = out[j * 4 + i];
IHT_4[tx_type].cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * stride + i]);
}
}
-static void iadst8_1d(int16_t *input, int16_t *output) {
+static void iadst8_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -395,8 +395,8 @@
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
-void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -416,12 +416,12 @@
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
+ }
}
-void vp9_short_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[8 * 8] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -441,12 +441,12 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * stride + i]);
}
}
-static void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(const int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -611,7 +611,7 @@
output[15] = step2[0] - step2[15];
}
-void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16];
int16_t *outptr = out;
int i, j;
@@ -630,12 +630,12 @@
temp_in[j] = out[j * 16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void iadst16_1d(int16_t *input, int16_t *output) {
+static void iadst16_1d(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -813,8 +813,8 @@
{ iadst16_1d, iadst16_1d } // ADST_ADST = 3
};
-void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
- int tx_type) {
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
int i, j;
int16_t out[16 * 16];
int16_t *outptr = out;
@@ -834,12 +834,11 @@
temp_in[j] = out[j * 16 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]); }
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]); }
}
-void vp9_short_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[16 * 16] = { 0 };
int16_t *outptr = out;
int i, j;
@@ -859,13 +858,12 @@
temp_in[j] = out[j*16 + i];
idct16_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,11 +872,11 @@
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
-static void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(const int16_t *input, int16_t *output) {
int16_t step1[32], step2[32];
int temp1, temp2;
@@ -1245,7 +1243,7 @@
output[31] = step1[0] - step1[31];
}
-void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
int16_t out[32 * 32];
int16_t *outptr = out;
int i, j;
@@ -1253,7 +1251,20 @@
// Rows
for (i = 0; i < 32; ++i) {
- idct32_1d(input, outptr);
+ int16_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ idct32_1d(input, outptr);
+ else
+ vpx_memset(outptr, 0, sizeof(int16_t) * 32);
input += 32;
outptr += 32;
}
@@ -1264,13 +1275,12 @@
temp_in[j] = out[j * 32 + i];
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
}
}
-void vp9_short_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
@@ -1281,28 +1291,27 @@
for (j = 0; j < 32; ++j) {
for (i = 0; i < 32; ++i)
dest[i] = clip_pixel(dest[i] + a1);
- dest += dest_stride;
+ dest += stride;
}
}
// idct
-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob > 1)
- vp9_short_idct4x4_add(input, dest, stride);
+ vp9_idct4x4_16_add(input, dest, stride);
else
- vp9_short_idct4x4_1_add(input, dest, stride);
+ vp9_idct4x4_1_add(input, dest, stride);
}
-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest, int stride,
- int eob) {
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
if (eob > 1)
- vp9_short_iwalsh4x4_add(input, dest, stride);
+ vp9_iwht4x4_16_add(input, dest, stride);
else
- vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
+ vp9_iwht4x4_1_add(input, dest, stride);
}
-void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -1313,64 +1322,66 @@
if (eob) {
if (eob == 1)
// DC only DCT coefficient
- vp9_short_idct8x8_1_add(input, dest, stride);
+ vp9_idct8x8_1_add(input, dest, stride);
else if (eob <= 10)
- vp9_short_idct8x8_10_add(input, dest, stride);
+ vp9_idct8x8_10_add(input, dest, stride);
else
- vp9_short_idct8x8_add(input, dest, stride);
+ vp9_idct8x8_64_add(input, dest, stride);
}
}
-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
if (eob) {
if (eob == 1)
/* DC only DCT coefficient. */
- vp9_short_idct16x16_1_add(input, dest, stride);
+ vp9_idct16x16_1_add(input, dest, stride);
else if (eob <= 10)
- vp9_short_idct16x16_10_add(input, dest, stride);
+ vp9_idct16x16_10_add(input, dest, stride);
else
- vp9_short_idct16x16_add(input, dest, stride);
+ vp9_idct16x16_256_add(input, dest, stride);
}
}
-void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob) {
if (eob) {
if (eob == 1)
- vp9_short_idct32x32_1_add(input, dest, stride);
+ vp9_idct32x32_1_add(input, dest, stride);
else
- vp9_short_idct32x32_add(input, dest, stride);
+ vp9_idct32x32_1024_add(input, dest, stride);
}
}
// iht
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
- int eob) {
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
if (tx_type == DCT_DCT)
- vp9_idct_add(input, dest, stride, eob);
+ vp9_idct4x4_add(input, dest, stride, eob);
else
- vp9_short_iht4x4_add(input, dest, stride, tx_type);
+ vp9_iht4x4_16_add(input, dest, stride, tx_type);
}
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
if (tx_type == DCT_DCT) {
- vp9_idct_add_8x8(input, dest, stride, eob);
+ vp9_idct8x8_add(input, dest, stride, eob);
} else {
if (eob > 0) {
- vp9_short_iht8x8_add(input, dest, stride, tx_type);
+ vp9_iht8x8_64_add(input, dest, stride, tx_type);
}
}
}
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob) {
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob) {
if (tx_type == DCT_DCT) {
- vp9_idct_add_16x16(input, dest, stride, eob);
+ vp9_idct16x16_add(input, dest, stride, eob);
} else {
if (eob > 0) {
- vp9_short_iht16x16_add(input, dest, stride, tx_type);
+ vp9_iht16x16_256_add(input, dest, stride, tx_type);
}
}
}
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 0ef905c..2b3f35f 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -81,28 +81,27 @@
return rv;
}
-typedef void (*transform_1d)(int16_t*, int16_t*);
+typedef void (*transform_1d)(const int16_t*, int16_t*);
typedef struct {
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest,
- int stride, int eob);
-void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+ int eob);
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob);
-
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob);
-
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
- int stride, int eob);
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+ int stride, int eob);
#endif // VP9_COMMON_VP9_IDCT_H_
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index b3b9e1d..2fabe2a 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -174,36 +174,17 @@
static INLINE void foreach_predicted_block_in_plane(
const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
- int i, x, y;
-
- // block sizes in number of 4x4 blocks log 2 ("*_b")
- // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
- // subsampled size of the block
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
- // size of the predictor to use.
- int pred_w, pred_h;
-
if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
assert(bsize == BLOCK_8X8);
- pred_w = 0;
- pred_h = 0;
+ for (y = 0; y < 1 << bhl; ++y)
+ for (x = 0; x < 1 << bwl; ++x)
+ visit(plane, i++, bsize, 0, 0, arg);
} else {
- pred_w = bwl;
- pred_h = bhl;
- }
- assert(pred_w <= bwl);
- assert(pred_h <= bhl);
-
- // visit each subblock in raster order
- i = 0;
- for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
- for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
- visit(plane, i, bsize, pred_w, pred_h, arg);
- i += 1 << pred_w;
- }
- i += (1 << (bwl + pred_h)) - (1 << bwl);
+ visit(plane, 0, bsize, bwl, bhl, arg);
}
}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 61be7c6..21513d4 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -31,7 +31,7 @@
# RECON
#
prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_4x4
+specialize vp9_d207_predictor_4x4 $ssse3_x86inc
prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_4x4 $ssse3_x86inc
@@ -49,7 +49,7 @@
specialize vp9_d135_predictor_4x4
prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_4x4
+specialize vp9_d153_predictor_4x4 $ssse3_x86inc
prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_4x4 $sse_x86inc
@@ -70,7 +70,7 @@
specialize vp9_dc_128_predictor_4x4
prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_8x8
+specialize vp9_d207_predictor_8x8 $ssse3_x86inc
prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_8x8 $ssse3_x86inc
@@ -88,7 +88,7 @@
specialize vp9_d135_predictor_8x8
prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_8x8
+specialize vp9_d153_predictor_8x8 $ssse3_x86inc
prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_8x8 $sse_x86inc
@@ -109,7 +109,7 @@
specialize vp9_dc_128_predictor_8x8
prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_16x16
+specialize vp9_d207_predictor_16x16 $ssse3_x86inc
prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_16x16 $ssse3_x86inc
@@ -127,7 +127,7 @@
specialize vp9_d135_predictor_16x16
prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_16x16
+specialize vp9_d153_predictor_16x16 $ssse3_x86inc
prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_v_predictor_16x16 $sse2_x86inc
@@ -148,7 +148,7 @@
specialize vp9_dc_128_predictor_16x16
prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_32x32
+specialize vp9_d207_predictor_32x32 $ssse3_x86inc
prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d45_predictor_32x32 $ssse3_x86inc
@@ -247,74 +247,72 @@
specialize vp9_convolve_avg $sse2_x86inc neon dspr2
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 ssse3 neon dspr2
+specialize vp9_convolve8 sse2 ssse3 neon dspr2
prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz ssse3 neon dspr2
+specialize vp9_convolve8_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert ssse3 neon dspr2
+specialize vp9_convolve8_vert sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg ssse3 neon dspr2
+specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz ssse3 neon dspr2
+specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert ssse3 neon dspr2
+specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
#
# dct
#
-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_1_add sse2 neon
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_1_add sse2 neon
-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct4x4_add sse2 neon
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct4x4_16_add sse2 neon
-prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2 neon
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_1_add sse2 neon
-prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_add sse2 neon
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_64_add sse2 neon
-prototype void vp9_short_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_10_add sse2 neon
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct8x8_10_add sse2 neon
-prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2 neon
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_1_add sse2 neon
-prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_add sse2 neon
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_256_add sse2 neon
-prototype void vp9_short_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_10_add sse2 neon
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct16x16_10_add sse2 neon
-prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2 neon
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1024_add sse2 neon
-prototype void vp9_short_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_1_add sse2
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_1_add sse2
-prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2 neon
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht4x4_16_add sse2 neon
-prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2 neon
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht8x8_64_add sse2 neon
-prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add sse2
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_iht16x16_256_add sse2
-prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
-specialize vp9_idct4_1d sse2
# dct and add
-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_1_add
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_1_add
-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_iwalsh4x4_add
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_iwht4x4_16_add
#
# Encoder functions below this point.
diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
new file mode 100644
index 0000000..f17da91
--- /dev/null
+++ b/vp9/common/vp9_scan.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/common/vp9_scan.h"
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+ 0, 4, 1, 5,
+ 8, 2, 12, 9,
+ 3, 6, 13, 10,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+ 0, 4, 8, 1,
+ 12, 5, 9, 2,
+ 13, 6, 10, 3,
+ 7, 14, 11, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+ 0, 1, 4, 2,
+ 5, 3, 6, 8,
+ 9, 7, 12, 10,
+ 13, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+ 0, 8, 1, 16, 9, 2, 17, 24,
+ 10, 3, 18, 25, 32, 11, 4, 26,
+ 33, 19, 40, 12, 34, 27, 5, 41,
+ 20, 48, 13, 35, 42, 28, 21, 6,
+ 49, 56, 36, 43, 29, 7, 14, 50,
+ 57, 44, 22, 37, 15, 51, 58, 30,
+ 45, 23, 52, 59, 38, 31, 60, 53,
+ 46, 39, 61, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+ 0, 8, 16, 1, 24, 9, 32, 17,
+ 2, 40, 25, 10, 33, 18, 48, 3,
+ 26, 41, 11, 56, 19, 34, 4, 49,
+ 27, 42, 12, 35, 20, 57, 50, 28,
+ 5, 43, 13, 36, 58, 51, 21, 44,
+ 6, 29, 59, 37, 14, 52, 22, 7,
+ 45, 60, 30, 15, 38, 53, 23, 46,
+ 31, 61, 39, 54, 47, 62, 55, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+ 0, 1, 2, 8, 9, 3, 16, 10,
+ 4, 17, 11, 24, 5, 18, 25, 12,
+ 19, 26, 32, 6, 13, 20, 33, 27,
+ 7, 34, 40, 21, 28, 41, 14, 35,
+ 48, 42, 29, 36, 49, 22, 43, 15,
+ 56, 37, 50, 44, 30, 57, 23, 51,
+ 58, 45, 38, 52, 31, 59, 53, 46,
+ 60, 39, 61, 47, 54, 55, 62, 63,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
+ 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
+ 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
+ 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146,
+ 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25,
+ 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119,
+ 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194,
+ 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59,
+ 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13,
+ 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169,
+ 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108,
+ 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140,
+ 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141,
+ 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142,
+ 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159,
+ 251,
+ 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+ 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
+ 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
+ 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
+ 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85,
+ 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179,
+ 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24,
+ 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227,
+ 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167,
+ 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229,
+ 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59,
+ 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170,
+ 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202,
+ 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125,
+ 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79,
+ 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205,
+ 236,
+ 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+ 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
+ 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
+ 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
+ 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100,
+ 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102,
+ 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160,
+ 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176,
+ 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136,
+ 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166,
+ 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108,
+ 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170,
+ 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186,
+ 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110,
+ 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111,
+ 158,
+ 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220,
+ 175,
+ 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
+ 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
+ 68, 131, 37, 100,
+ 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38,
+ 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321,
+ 102, 352, 8, 197,
+ 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292,
+ 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293,
+ 41, 417, 199, 136,
+ 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105,
+ 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169,
+ 295, 420, 106, 451,
+ 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421,
+ 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391,
+ 453, 139, 44, 234,
+ 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108,
+ 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577,
+ 486, 77, 204, 362,
+ 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173,
+ 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17,
+ 111, 238, 48, 143,
+ 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51,
+ 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424,
+ 393, 300, 269, 176, 145,
+ 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301,
+ 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581,
+ 550, 519, 488, 457, 426, 395,
+ 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737,
+ 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241,
+ 210, 179, 117, 86, 55, 738, 707,
+ 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491,
+ 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676,
+ 645, 552, 521, 428, 397, 304,
+ 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553,
+ 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26,
+ 864, 833, 802, 771, 740, 709,
+ 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306,
+ 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741,
+ 710, 679, 617, 586, 555, 493,
+ 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835,
+ 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867,
+ 743, 619, 495, 371, 247, 123,
+ 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680,
+ 649, 556, 525, 432, 401, 308, 277, 184, 153, 60, 29, 960, 929,
+ 898, 836, 805, 774, 712, 681,
+ 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154,
+ 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682,
+ 651, 620, 589, 558, 527,
+ 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124,
+ 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590,
+ 559, 497, 466, 435, 373,
+ 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715,
+ 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623,
+ 499, 375, 251, 127,
+ 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560,
+ 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716,
+ 685, 654, 592, 561,
+ 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903,
+ 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469,
+ 438, 407, 376, 345,
+ 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718,
+ 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998,
+ 967, 874, 843, 750,
+ 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503,
+ 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657,
+ 564, 533, 440, 409,
+ 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534,
+ 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783,
+ 752, 721, 690, 659,
+ 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970,
+ 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381,
+ 350, 319, 1002, 971,
+ 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631,
+ 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568,
+ 537, 444, 413, 972,
+ 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414,
+ 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601,
+ 570, 539, 508, 477,
+ 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571,
+ 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479,
+ 1007, 883, 759, 635, 511,
+ 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945,
+ 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915,
+ 884, 853, 822, 791,
+ 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823,
+ 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607,
+ 1011, 887, 763, 639,
+ 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825,
+ 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733,
+ 702, 671, 1013, 982,
+ 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015,
+ 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798,
+ 1016, 985, 954, 923,
+ 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863,
+ 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021,
+ 990, 959, 1022, 991, 1023,
+};
+
+// Neighbor tables for the various scans and block sizes, filled in by
+// vp9_init_neighbors(). Each scan position owns MAX_NEIGHBORS entries
+// holding the raster positions of its context neighbors (above and/or
+// left); both entries are equal when only one neighbor is used.
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+// Fills iscan with the inverse permutation of scan, i.e. iscan[scan[n]] == n
+// for every scan position n of an l x l block, in one linear pass.
+static void invert_scan(const int16_t *scan, int16_t *iscan, int l) {
+  const int l2 = l * l;
+  int n;
+  for (n = 0; n < l2; n++) {
+    assert(scan[n] >= 0 && scan[n] < l2);
+    iscan[scan[n]] = n;
+  }
+  // scan must be a permutation: every raster position is hit exactly once
+  for (n = 0; n < l2; n++)
+    assert(scan[iscan[n]] == n);
+}
+// Builds the inverse scan and the context-neighbor table of one l x l scan:
+// MAX_NEIGHBORS raster positions per scan position plus one padding entry.
+static void init_scan_neighbors(const int16_t *scan,
+                                int16_t *iscan,
+                                int l, int16_t *neighbors) {
+  const int l2 = l * l;
+  const int is_col_scan = scan == vp9_col_scan_4x4 ||
+                          scan == vp9_col_scan_8x8 ||
+                          scan == vp9_col_scan_16x16;
+  const int is_row_scan = scan == vp9_row_scan_4x4 ||
+                          scan == vp9_row_scan_8x8 ||
+                          scan == vp9_row_scan_16x16;
+  int n;
+
+  // Invert the scan up front: O(l2) instead of one linear search per
+  // coefficient, and the ordering assert below reads final iscan values.
+  invert_scan(scan, iscan, l);
+  // dc doesn't use this type of prediction
+  neighbors[MAX_NEIGHBORS * 0 + 0] = 0;
+  neighbors[MAX_NEIGHBORS * 0 + 1] = 0;
+  for (n = 1; n < l2; n++) {
+    int16_t *const nb = &neighbors[MAX_NEIGHBORS * n];
+    const int rc = scan[n];
+    const int i = rc / l;
+    const int j = rc % l;
+    const int above = rc - l;  // (i - 1) * l + j
+    const int left = rc - 1;   // i * l + j - 1
+    if (i > 0 && j > 0) {
+      // col/row scan is used for adst/dct: energy decays faster along the
+      // ADST dimension and correlation is higher along the DCT one, so use
+      // the DCT-direction neighbor for both slots (the context average
+      // a+b+1>>1 then yields it unchanged). Otherwise use above and left.
+      if (is_col_scan) {
+        nb[0] = nb[1] = above;
+      } else if (is_row_scan) {
+        nb[0] = nb[1] = left;
+      } else {
+        nb[0] = above;
+        nb[1] = left;
+      }
+    } else if (i > 0) {
+      nb[0] = nb[1] = above;  // left edge: only the above neighbor exists
+    } else {
+      assert(j > 0);
+      nb[0] = nb[1] = left;  // top edge: only the left neighbor exists
+    }
+    assert(iscan[nb[0]] < n);  // context must precede n in scan order
+  }
+  // one padding item so we don't have to add branches in code to handle
+  // calls to get_coef_context() for the token after the final dc token
+  neighbors[MAX_NEIGHBORS * l2 + 0] = 0;
+  neighbors[MAX_NEIGHBORS * l2 + 1] = 0;
+}
+
+void vp9_init_neighbors(void) {  // fills every iscan and neighbors table
+  init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
+                      vp9_default_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
+                      vp9_row_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
+                      vp9_col_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
+                      vp9_default_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
+                      vp9_row_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
+                      vp9_col_scan_8x8_neighbors);
+  init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
+                      vp9_default_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
+                      vp9_row_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
+                      vp9_col_scan_16x16_neighbors);
+  init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
+                      vp9_default_scan_32x32_neighbors);
+}
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
new file mode 100644
index 0000000..a5c8463
--- /dev/null
+++ b/vp9/common/vp9_scan.h
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SCAN_H_
+#define VP9_COMMON_VP9_SCAN_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_enums.h"
+
+#define MAX_NEIGHBORS 2
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
+
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int16_t,
+ vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+
+
+void vp9_init_neighbors(void);  // fills the iscan and neighbors tables above
+
+// Returns the 4x4 coefficient scan order for tx_type: the row scan for
+// ADST_DCT, the column scan for DCT_ADST and the default scan otherwise.
+static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_scan_4x4;
+  if (tx_type == DCT_ADST)
+    return vp9_col_scan_4x4;
+
+  return vp9_default_scan_4x4;
+}
+
+// Picks the 4x4 scan order for tx_type and stores it, together with the
+// matching neighbors table, through the scan and nb output pointers.
+static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
+                                   const int16_t **scan, const int16_t **nb) {
+  if (tx_type == ADST_DCT) {
+    *scan = vp9_row_scan_4x4;
+    *nb = vp9_row_scan_4x4_neighbors;
+    return;
+  }
+  if (tx_type == DCT_ADST) {
+    *scan = vp9_col_scan_4x4;
+    *nb = vp9_col_scan_4x4_neighbors;
+    return;
+  }
+  *scan = vp9_default_scan_4x4;
+  *nb = vp9_default_scan_4x4_neighbors;
+}
+
+// Returns the 4x4 iscan table for tx_type: the row table for ADST_DCT,
+// the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_iscan_4x4;
+  if (tx_type == DCT_ADST)
+    return vp9_col_iscan_4x4;
+
+  return vp9_default_iscan_4x4;
+}
+
+// Returns the 8x8 coefficient scan order for tx_type: the row scan for
+// ADST_DCT, the column scan for DCT_ADST and the default scan otherwise.
+static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_scan_8x8;
+  if (tx_type == DCT_ADST)
+    return vp9_col_scan_8x8;
+
+  return vp9_default_scan_8x8;
+}
+
+// Picks the 8x8 scan order for tx_type and stores it, together with the
+// matching neighbors table, through the scan and nb output pointers.
+static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
+                                   const int16_t **scan, const int16_t **nb) {
+  if (tx_type == ADST_DCT) {
+    *scan = vp9_row_scan_8x8;
+    *nb = vp9_row_scan_8x8_neighbors;
+    return;
+  }
+  if (tx_type == DCT_ADST) {
+    *scan = vp9_col_scan_8x8;
+    *nb = vp9_col_scan_8x8_neighbors;
+    return;
+  }
+  *scan = vp9_default_scan_8x8;
+  *nb = vp9_default_scan_8x8_neighbors;
+}
+
+// Returns the 8x8 iscan table for tx_type: the row table for ADST_DCT,
+// the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_iscan_8x8;
+  if (tx_type == DCT_ADST)
+    return vp9_col_iscan_8x8;
+
+  return vp9_default_iscan_8x8;
+}
+
+// Returns the 16x16 coefficient scan order for tx_type: the row scan for
+// ADST_DCT, the column scan for DCT_ADST and the default scan otherwise.
+static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_scan_16x16;
+  if (tx_type == DCT_ADST)
+    return vp9_col_scan_16x16;
+
+  return vp9_default_scan_16x16;
+}
+
+// Picks the 16x16 scan order for tx_type and stores it, together with the
+// matching neighbors table, through the scan and nb output pointers.
+static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
+                                     const int16_t **scan, const int16_t **nb) {
+  if (tx_type == ADST_DCT) {
+    *scan = vp9_row_scan_16x16;
+    *nb = vp9_row_scan_16x16_neighbors;
+    return;
+  }
+  if (tx_type == DCT_ADST) {
+    *scan = vp9_col_scan_16x16;
+    *nb = vp9_col_scan_16x16_neighbors;
+    return;
+  }
+  *scan = vp9_default_scan_16x16;
+  *nb = vp9_default_scan_16x16_neighbors;
+}
+
+// Returns the 16x16 iscan table for tx_type: the row table for ADST_DCT,
+// the column table for DCT_ADST and the default table otherwise.
+static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
+  if (tx_type == ADST_DCT)
+    return vp9_row_iscan_16x16;
+  if (tx_type == DCT_ADST)
+    return vp9_col_iscan_16x16;
+
+  return vp9_default_iscan_16x16;
+}
+
+// Context for entry c: rounded mean of the cached tokens at its two neighbors.
+static INLINE int get_coef_context(const int16_t *neighbors,
+                                   uint8_t *token_cache, int c) {
+  const int16_t *const nb = &neighbors[MAX_NEIGHBORS * c];
+  return (1 + token_cache[nb[0]] + token_cache[nb[1]]) >> 1;
+}
+
+#endif // VP9_COMMON_VP9_SCAN_H_
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index cc909e2..254a431 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -13,6 +13,7 @@
#ifdef _MSC_VER
#include <math.h>
+#define snprintf _snprintf
#endif
#include "./vpx_config.h"
@@ -23,8 +24,8 @@
#define vp9_clear_system_state()
#endif
-#ifdef _MSC_VER
-// round is not defined in MSVC
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// round is not defined in MSVC before VS2013.
static int round(double x) {
if (x < 0)
return (int)ceil(x - 0.5);
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 31182c3..4ba171f 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -21,6 +21,8 @@
typedef int8_t vp9_tree_index;
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
#define vp9_complement(x) (255 - x)
/* We build coding trees compactly in arrays.
@@ -30,7 +32,7 @@
Index > 0 means need another bit, specification at index.
Nonnegative indices are always even; processing begins at node 0. */
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+typedef const vp9_tree_index vp9_tree[];
struct vp9_token {
int value;
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 3f1c198..ba9ceb2 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -36,90 +36,28 @@
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
+// Signature shared by all vp9_filter_block1d* kernels declared in this file:
+// source pointer and pitch, output pointer and pitch, height, filter.
+typedef void filter8_1dfunction(const unsigned char *src_ptr,
+                                const unsigned int src_pitch,
+                                unsigned char *output_ptr,
+                                unsigned int out_pitch,
+                                unsigned int output_height,
+                                const short *filter);
+
#if HAVE_SSSE3
-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
-
-void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
- const unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- const short *filter);
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -317,3 +255,214 @@
}
}
#endif
+
+#if HAVE_SSE2
+filter8_1dfunction vp9_filter_block1d16_v8_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_sse2;
+filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
+                              const int16_t *filter_x, int x_step_q4,
+                              const int16_t *filter_y, int y_step_q4,
+                              int w, int h) {
+  // The SSE2 kernels are used only when x_step_q4 == 16 and
+  // filter_x[3] != 128. Columns they do not cover, and every other
+  // configuration, are handled by the C implementation.
+  const int use_sse2 = x_step_q4 == 16 && filter_x[3] != 128;
+  if (use_sse2) {
+    // Widest kernel first: 16 columns per call.
+    for (; w >= 16; w -= 16) {
+      vp9_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride,
+                                   h, filter_x);
+      src += 16;
+      dst += 16;
+    }
+    // Then 8 columns per call.
+    for (; w >= 8; w -= 8) {
+      vp9_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride,
+                                  h, filter_x);
+      src += 8;
+      dst += 8;
+    }
+    // Then 4 columns per call.
+    for (; w >= 4; w -= 4) {
+      vp9_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride,
+                                  h, filter_x);
+      src += 4;
+      dst += 4;
+    }
+  }
+  if (w != 0) {
+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4,
+                          w, h);
+  }
+}
+
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4,
+                             int w, int h) {
+  // SSE2 kernels run only when y_step_q4 == 16 and filter_y[3] != 128; they
+  // read from 3 rows above src. Whatever is left goes to the C version.
+  const int use_sse2 = y_step_q4 == 16 && filter_y[3] != 128;
+  if (use_sse2) {
+    // Widest kernel first: 16 columns per call.
+    for (; w >= 16; w -= 16) {
+      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
+                                   dst, dst_stride, h, filter_y);
+      src += 16;
+      dst += 16;
+    }
+    // Then 8 columns per call.
+    for (; w >= 8; w -= 8) {
+      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
+                                  dst, dst_stride, h, filter_y);
+      src += 8;
+      dst += 8;
+    }
+    // Then 4 columns per call.
+    for (; w >= 4; w -= 4) {
+      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
+                                  dst, dst_stride, h, filter_y);
+      src += 4;
+      dst += 4;
+    }
+  }
+  if (w != 0) {
+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4, filter_y, y_step_q4,
+                         w, h);
+  }
+}
+
+void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4,
+                                  int w, int h) {
+  // Dispatches to the *_h8_avg_sse2 kernels when x_step_q4 == 16 and
+  // filter_x[3] != 128; whatever is left goes to the C version.
+  const int use_sse2 = x_step_q4 == 16 && filter_x[3] != 128;
+  if (use_sse2) {
+    // Widest kernel first: 16 columns per call.
+    for (; w >= 16; w -= 16) {
+      vp9_filter_block1d16_h8_avg_sse2(src, src_stride, dst, dst_stride,
+                                       h, filter_x);
+      src += 16;
+      dst += 16;
+    }
+    // Then 8 columns per call.
+    for (; w >= 8; w -= 8) {
+      vp9_filter_block1d8_h8_avg_sse2(src, src_stride, dst, dst_stride,
+                                      h, filter_x);
+      src += 8;
+      dst += 8;
+    }
+    // Then 4 columns per call.
+    for (; w >= 4; w -= 4) {
+      vp9_filter_block1d4_h8_avg_sse2(src, src_stride, dst, dst_stride,
+                                      h, filter_x);
+      src += 4;
+      dst += 4;
+    }
+  }
+  if (w != 0) {
+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+  }
+}
+
+void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const int16_t *filter_x, int x_step_q4,
+                                 const int16_t *filter_y, int y_step_q4,
+                                 int w, int h) {
+  // Dispatches to the *_v8_avg_sse2 kernels (reading from 3 rows above src)
+  // when y_step_q4 == 16 and filter_y[3] != 128; the C version does the rest.
+  const int use_sse2 = y_step_q4 == 16 && filter_y[3] != 128;
+  if (use_sse2) {
+    // Widest kernel first: 16 columns per call.
+    for (; w >= 16; w -= 16) {
+      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                       dst, dst_stride, h, filter_y);
+      src += 16;
+      dst += 16;
+    }
+    // Then 8 columns per call.
+    for (; w >= 8; w -= 8) {
+      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                      dst, dst_stride, h, filter_y);
+      src += 8;
+      dst += 8;
+    }
+    // Then 4 columns per call.
+    for (; w >= 4; w -= 4) {
+      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
+                                      dst, dst_stride, h, filter_y);
+      src += 4;
+      dst += 4;
+    }
+  }
+  if (w != 0) {
+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4, filter_y, y_step_q4,
+                             w, h);
+  }
+}
+
+void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int x_step_q4,
+                        const int16_t *filter_y, int y_step_q4,
+                        int w, int h) {
+  // Intermediate rows of the horizontal pass: stride 64, up to 64 + 7 rows.
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_c(src, src_stride, dst, dst_stride,
+                    filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+    return;
+  }
+  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, h + 7);
+  vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+}
+
+void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const int16_t *filter_x, int x_step_q4,
+                            const int16_t *filter_y, int y_step_q4,
+                            int w, int h) {
+  // Intermediate rows of the horizontal pass: stride 64, up to 64 + 7 rows.
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+  assert(w <= 64);
+  assert(h <= 64);
+  if (x_step_q4 != 16 || y_step_q4 != 16) {
+    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+                        filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+    return;
+  }
+  vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
+                           filter_x, x_step_q4, filter_y, y_step_q4,
+                           w, h + 7);
+  vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
+                              filter_x, x_step_q4, filter_y, y_step_q4,
+                              w, h);
+}
+#endif
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index d00993c..cfec36b 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((__m128i *)input);
- input1 = _mm_loadl_epi64((__m128i *)(input + 4));
- input2 = _mm_loadl_epi64((__m128i *)(input + 8));
- input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+ input0 = _mm_loadl_epi64((const __m128i *)input);
+ input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+ input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+ input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@
RECON_AND_STORE4X4(dest, input3);
}
-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -165,41 +165,6 @@
RECON_AND_STORE4X4(dest, dc_value);
}
-void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
-
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i in, temp;
-
- // Load input data.
- in = _mm_loadl_epi64((__m128i *)input);
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- in = _mm_shufflelo_epi16(in, 0xd8);
- in = _mm_unpacklo_epi32(in, in);
-
- // Stage 1
- in = _mm_madd_epi16(in, c1);
- in = _mm_add_epi32(in, rounding);
- in = _mm_srai_epi32(in, DCT_CONST_BITS);
- in = _mm_packs_epi32(in, zero);
-
- // Stage 2
- temp = _mm_shufflelo_epi16(in, 0x9c);
- in = _mm_shufflelo_epi16(in, 0xc9);
- in = _mm_unpacklo_epi64(temp, in);
- in = _mm_madd_epi16(in, c2);
- in = _mm_packs_epi32(in, zero);
-
- // Store results
- _mm_storel_epi64((__m128i *)output, in);
-}
-
static INLINE void transpose_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
@@ -210,7 +175,7 @@
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
-void idct4_1d_sse2(__m128i *in) {
+static void idct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -249,7 +214,7 @@
in[3] = _mm_sub_epi16(u[0], u[3]);
}
-void iadst4_1d_sse2(__m128i *in) {
+static void iadst4_1d_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -299,16 +264,16 @@
in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
-void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[4];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadl_epi64((__m128i *)input);
- in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
- in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
- in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+ in[0] = _mm_loadl_epi64((const __m128i *)input);
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
switch (tx_type) {
case 0: // DCT_DCT
@@ -529,7 +494,7 @@
dest += stride; \
}
-void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -549,14 +514,14 @@
int i;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
// 2-D
for (i = 0; i < 2; i++) {
@@ -597,7 +562,7 @@
RECON_AND_STORE(dest, in7);
}
-void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a;
@@ -648,7 +613,7 @@
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
-void idct8_1d_sse2(__m128i *in) {
+static void idct8_1d_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -689,7 +654,7 @@
in[7] = in7;
}
-void iadst8_1d_sse2(__m128i *in) {
+static void iadst8_1d_sse2(__m128i *in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -918,21 +883,21 @@
}
-void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
const __m128i final_rounding = _mm_set1_epi16(1<<4);
// load input data
- in[0] = _mm_load_si128((__m128i *)input);
- in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
- in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
- in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
- in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
- in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
- in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
- in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
switch (tx_type) {
case 0: // DCT_DCT
@@ -985,7 +950,7 @@
RECON_AND_STORE(dest, in[7]);
}
-void vp9_short_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -1005,10 +970,10 @@
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Rows. Load 4-row input data.
- in0 = _mm_load_si128((__m128i *)input);
- in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
// 8x4 Transpose
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
@@ -1263,7 +1228,8 @@
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -1318,22 +1284,22 @@
if (i == 1) input += 128;
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
@@ -1470,7 +1436,7 @@
}
}
-void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
@@ -1519,7 +1485,7 @@
res0[15] = tbuf[7];
}
-void iadst16_1d_8col(__m128i *in) {
+static void iadst16_1d_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1989,7 +1955,7 @@
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-void idct16_1d_8col(__m128i *in) {
+static void idct16_1d_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2333,36 +2299,36 @@
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
idct16_1d_8col(in0);
idct16_1d_8col(in1);
}
-void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
+static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
iadst16_1d_8col(in0);
iadst16_1d_8col(in1);
}
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
- in[8] = _mm_load_si128((__m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((__m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((__m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((__m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((__m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((__m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((__m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((__m128i *)(input + 15 * 16));
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
+ in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
+ in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
+ in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
+ in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
+ in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
+ in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
+ in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -2421,8 +2387,8 @@
RECON_AND_STORE(dest, in[15]);
}
-void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
- int tx_type) {
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+ int tx_type) {
__m128i in0[16], in1[16];
load_buffer_8x16(input, in0);
@@ -2456,8 +2422,8 @@
write_buffer_8x16(dest, in1, stride);
}
-void vp9_short_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
- int stride) {
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
const __m128i zero = _mm_setzero_si128();
@@ -2503,14 +2469,14 @@
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in0 = _mm_load_si128((const __m128i *)input);
+ in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2815,11 +2781,12 @@
#define LOAD_DQCOEFF(reg, input) \
{ \
- reg = _mm_load_si128((__m128i *) input); \
+ reg = _mm_load_si128((const __m128i *) input); \
input += 8; \
} \
-void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3550,7 +3517,7 @@
}
} //NOLINT
-void vp9_short_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
const __m128i zero = _mm_setzero_si128();
int a, i;
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm
index c51d011..314d1a2 100644
--- a/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -13,27 +13,23 @@
SECTION_RODATA
pb_1: times 16 db 1
-pw_2: times 8 dw 2
-pb_7m1: times 8 db 7, -1
-pb_15: times 16 db 15
-
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
-sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
-sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
-sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
-sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
-sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
-sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
-sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
-sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
+sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
SECTION .text
@@ -455,3 +451,590 @@
jnz .loop
RESTORE_GOT
REP_RET
+
+INIT_XMM ssse3 ; D153 intra predictor, 4x4 block: reads left[0..3] and above[-1..2]
+cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ movd m0, [leftq] ; l1, l2, l3, l4
+ movd m1, [aboveq-1] ; tl, t1, t2, t3
+ punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3
+ pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
+ psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3
+ psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3
+ ; output layout, row 0 on top (A = 2-tap avg, B/C/D = 3-tap avg):
+ ; A1 B1 C1 D1
+ ; A2 B2 A1 B1
+ ; A3 B3 A2 B2
+ ; A4 B4 A3 B3
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1
+ pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1
+
+ punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
+
+ DEFINE_ARGS dst, stride, stride3 ; above/left are not read past this point
+ lea stride3q, [strideq*3]
+ pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+stride3q ], m3 ; row 3
+ psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq*2], m3 ; row 2
+ psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 ..
+ movd [dstq+strideq ], m3 ; row 1
+ psrldq m3, 2 ; A1 B1 C1 D1 ..
+ movd [dstq ], m3 ; row 0
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3 ; D153 intra predictor, 8x8 block: reads left[0..7] and above[-1..6]
+cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ movq m0, [leftq] ; [0- 7] l1-8 [byte]
+ movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte]
+ pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [byte]
+ pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [byte]
+ pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [byte]
+ pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [byte]
+ psrldq m4, m0, 1 ; t1-7 [byte]
+ psrldq m5, m0, 2 ; t2-7 [byte]
+ ; output layout, row 0 on top (A = 2-tap avg, B..H = 3-tap avg):
+ ; A1 B1 C1 D1 E1 F1 G1 H1
+ ; A2 B2 A1 B1 C1 D1 E1 F1
+ ; A3 B3 A2 B2 A1 B1 C1 D1
+ ; A4 B4 A3 B3 A2 B2 A1 B1
+ ; A5 B5 A4 B4 A3 B3 A2 B2
+ ; A6 B6 A5 B5 A4 B4 A3 B3
+ ; A7 B7 A6 B6 A5 B5 A4 B4
+ ; A8 B8 A7 B7 A6 B6 A5 B5
+ pavgb m6, m1, m2 ; 2-tap avg A8-A1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1
+
+ punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1
+
+ DEFINE_ARGS dst, stride, stride3 ; above/left are not read past this point
+ lea stride3q, [strideq*3]
+
+ movhps [dstq+stride3q], m6 ; row 3: A-B4, A-B3, A-B2, A-B1
+ palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1
+ movq [dstq+strideq*2], m0 ; row 2
+ psrldq m0, 2 ; A-B2, A-B1, C-H1
+ movq [dstq+strideq ], m0 ; row 1
+ psrldq m0, 2 ; A-H1
+ movq [dstq ], m0 ; row 0
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 4
+ movq [dstq+stride3q ], m6 ; row 7: A-B8, A-B7, A-B6, A-B5
+ psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4
+ movq [dstq+strideq*2], m6 ; row 6
+ psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3
+ movq [dstq+strideq ], m6 ; row 5
+ psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2
+ movq [dstq ], m6 ; row 4
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3 ; D153 intra predictor, 16x16 block: reads left[0..15] and above[-1..14]
+cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ mova m0, [leftq] ; l1-16 [byte]
+ movu m7, [aboveq-1] ; tl, t1-15 [byte]
+ ; output layout, row 0 on top (A = 2-tap avg, B..P = 3-tap avg):
+ ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
+ ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
+ ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
+ ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
+ ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
+ ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
+ ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
+ ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
+ ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
+ ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
+ ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
+ ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
+ ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
+ ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
+ ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
+ ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
+ pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; above bytes reversed: t15 .. t1, tl
+ palignr m5, m0, m6, 15 ; tl, l1-15
+ palignr m3, m0, m6, 14 ; t1, tl, l1-14
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; NOTE(review): m1 is not read before the macro below names it as 4th operand - looks dead, confirm
+ pavgb m5, m0 ; A1 - Ag
+
+ punpcklbw m0, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+
+ pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] ; t1-15, t15
+ pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] ; t2-15, t15, t15
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1
+
+ pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the low interleaved half
+ DEFINE_ARGS dst, stride, stride3 ; above/left are not read past this point
+ lea stride3q, [strideq*3]
+ palignr m2, m1, m6, 14
+ mova [dstq ], m2 ; row 0
+ palignr m2, m1, m6, 12
+ mova [dstq+strideq ], m2 ; row 1
+ palignr m2, m1, m6, 10
+ mova [dstq+strideq*2], m2 ; row 2
+ palignr m2, m1, m6, 8
+ mova [dstq+stride3q ], m2 ; row 3
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 4
+ palignr m2, m1, m6, 6
+ mova [dstq ], m2 ; row 4
+ palignr m2, m1, m6, 4
+ mova [dstq+strideq ], m2 ; row 5
+ palignr m2, m1, m6, 2
+ mova [dstq+strideq*2], m2 ; row 6
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the high interleaved half
+ mova [dstq+stride3q ], m6 ; row 7
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 8
+
+ palignr m2, m6, m4, 14
+ mova [dstq ], m2 ; row 8
+ palignr m2, m6, m4, 12
+ mova [dstq+strideq ], m2 ; row 9
+ palignr m2, m6, m4, 10
+ mova [dstq+strideq*2], m2 ; row 10
+ palignr m2, m6, m4, 8
+ mova [dstq+stride3q ], m2 ; row 11
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 12
+ palignr m2, m6, m4, 6
+ mova [dstq ], m2 ; row 12
+ palignr m2, m6, m4, 4
+ mova [dstq+strideq ], m2 ; row 13
+ palignr m2, m6, m4, 2
+ mova [dstq+strideq*2], m2 ; row 14
+ mova [dstq+stride3q ], m4 ; row 15
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3 ; D153 intra predictor, 32x32 block: reads left[0..31] and above[-1..30]
+cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ mova m0, [leftq] ; left[0..15]
+ movu m7, [aboveq-1] ; above[-1..14]: tl, t1-15
+ movu m1, [aboveq+15] ; above[15..30]
+
+ pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] ; m1 shifted down 1 byte, last byte repeated
+ pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] ; m1 shifted down 2 bytes, last byte repeated
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high]
+
+ palignr m3, m1, m7, 1 ; above[0..15]
+ palignr m5, m1, m7, 2 ; above[1..16]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low]
+
+ pshufb m7, [GLOBAL(sh_bfedcba9876543210)] ; above[-1..14] byte-reversed, tl ends up in byte 15
+ palignr m5, m0, m7, 15 ; tl, left[0..14]
+ palignr m3, m0, m7, 14 ; t1, tl, left[0..13]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg
+ pavgb m5, m0 ; A1 - Ag
+ punpcklbw m6, m4, m5 ; A-B8 ... A-B1
+ punpckhbw m4, m5 ; A-B9 ... A-Bg
+ pshufb m6, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the low interleaved half
+ pshufb m4, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the high interleaved half
+
+ DEFINE_ARGS dst, stride, stride3, left, line ; NOTE(review): `line` is never referenced below
+ lea stride3q, [strideq*3]
+
+ palignr m5, m2, m1, 14
+ palignr m7, m1, m6, 14
+ mova [dstq ], m7 ; row 0
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 12
+ palignr m7, m1, m6, 12
+ mova [dstq+strideq ], m7 ; row 1
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 10
+ palignr m7, m1, m6, 10
+ mova [dstq+strideq*2 ], m7 ; row 2
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m2, m1, 8
+ palignr m7, m1, m6, 8
+ mova [dstq+stride3q ], m7 ; row 3
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 4
+ palignr m5, m2, m1, 6
+ palignr m7, m1, m6, 6
+ mova [dstq ], m7 ; row 4
+ mova [dstq+16 ], m5
+ palignr m5, m2, m1, 4
+ palignr m7, m1, m6, 4
+ mova [dstq+strideq ], m7 ; row 5
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m2, m1, 2
+ palignr m7, m1, m6, 2
+ mova [dstq+strideq*2 ], m7 ; row 6
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m6 ; row 7
+ mova [dstq+stride3q+16 ], m1
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 8
+
+ palignr m5, m1, m6, 14
+ palignr m3, m6, m4, 14
+ mova [dstq ], m3 ; row 8
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 12
+ palignr m3, m6, m4, 12
+ mova [dstq+strideq ], m3 ; row 9
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 10
+ palignr m3, m6, m4, 10
+ mova [dstq+strideq*2 ], m3 ; row 10
+ mova [dstq+strideq*2+16], m5
+ palignr m5, m1, m6, 8
+ palignr m3, m6, m4, 8
+ mova [dstq+stride3q ], m3 ; row 11
+ mova [dstq+stride3q+16 ], m5
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 12
+ palignr m5, m1, m6, 6
+ palignr m3, m6, m4, 6
+ mova [dstq ], m3 ; row 12
+ mova [dstq+16 ], m5
+ palignr m5, m1, m6, 4
+ palignr m3, m6, m4, 4
+ mova [dstq+strideq ], m3 ; row 13
+ mova [dstq+strideq+16 ], m5
+ palignr m5, m1, m6, 2
+ palignr m3, m6, m4, 2
+ mova [dstq+strideq*2 ], m3 ; row 14
+ mova [dstq+strideq*2+16], m5
+ mova [dstq+stride3q ], m4 ; row 15
+ mova [dstq+stride3q+16 ], m6
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 16
+
+ mova m7, [leftq] ; reload left[0..15]
+ mova m3, [leftq+16] ; left[16..31]
+ palignr m5, m3, m7, 15 ; left[15..30]
+ palignr m0, m3, m7, 14 ; left[14..29]
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh -
+ pavgb m5, m3 ; Ah -
+ punpcklbw m3, m2, m5 ; A-B8 ... A-B1
+ punpckhbw m2, m5 ; A-B9 ... A-Bg
+ pshufb m3, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the low interleaved half
+ pshufb m2, [GLOBAL(sh_bfedcba9876543210)] ; byte-reverse the high interleaved half
+
+ palignr m7, m6, m4, 14
+ palignr m0, m4, m3, 14
+ mova [dstq ], m0 ; row 16
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 12
+ palignr m0, m4, m3, 12
+ mova [dstq+strideq ], m0 ; row 17
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 10
+ palignr m0, m4, m3, 10
+ mova [dstq+strideq*2 ], m0 ; row 18
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m6, m4, 8
+ palignr m0, m4, m3, 8
+ mova [dstq+stride3q ], m0 ; row 19
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 20
+ palignr m7, m6, m4, 6
+ palignr m0, m4, m3, 6
+ mova [dstq ], m0 ; row 20
+ mova [dstq+16 ], m7
+ palignr m7, m6, m4, 4
+ palignr m0, m4, m3, 4
+ mova [dstq+strideq ], m0 ; row 21
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m6, m4, 2
+ palignr m0, m4, m3, 2
+ mova [dstq+strideq*2 ], m0 ; row 22
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m3 ; row 23
+ mova [dstq+stride3q+16 ], m4
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 24
+
+ palignr m7, m4, m3, 14
+ palignr m0, m3, m2, 14
+ mova [dstq ], m0 ; row 24
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 12
+ palignr m0, m3, m2, 12
+ mova [dstq+strideq ], m0 ; row 25
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 10
+ palignr m0, m3, m2, 10
+ mova [dstq+strideq*2 ], m0 ; row 26
+ mova [dstq+strideq*2+16], m7
+ palignr m7, m4, m3, 8
+ palignr m0, m3, m2, 8
+ mova [dstq+stride3q ], m0 ; row 27
+ mova [dstq+stride3q+16 ], m7
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 28
+ palignr m7, m4, m3, 6
+ palignr m0, m3, m2, 6
+ mova [dstq ], m0 ; row 28
+ mova [dstq+16 ], m7
+ palignr m7, m4, m3, 4
+ palignr m0, m3, m2, 4
+ mova [dstq+strideq ], m0 ; row 29
+ mova [dstq+strideq+16 ], m7
+ palignr m7, m4, m3, 2
+ palignr m0, m3, m2, 2
+ mova [dstq+strideq*2 ], m0 ; row 30
+ mova [dstq+strideq*2+16], m7
+ mova [dstq+stride3q ], m2 ; row 31
+ mova [dstq+stride3q+16 ], m3
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX ssse3 ; D207 intra predictor, 4x4 block: reads only left[0..3]
+cglobal d207_predictor_4x4, 2, 5, 4, dst, stride, unused, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ movifnidn leftq, leftmp ; fetch the left pointer argument into its register
+ movd m0, [leftq] ; abcd [byte]
+ pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte]
+ pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 ; 3-tap avg: a2bc, b2cd, c3d, d
+ pavgb m1, m0 ; ab, bc, cd, d [byte]
+
+ punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d
+ movd [dstq ], m1 ; row 0
+ psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d
+ movd [dstq+strideq], m1 ; row 1
+ lea dstq, [dstq+strideq*2] ; dstq now addresses row 2
+ psrlq m1, 16 ; cd, c3d, d, d
+ movd [dstq ], m1 ; row 2
+ pshufw m1, m1, q1111 ; d, d, d, d
+ movd [dstq+strideq], m1 ; row 3
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3 ; D207 intra predictor, 8x8 block: reads only left[0..7]
+cglobal d207_predictor_8x8, 2, 5, 4, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ movifnidn leftq, leftmp ; fetch the left pointer argument into its register
+ movq m3, [leftq] ; abcdefgh [byte]
+ lea stride3q, [strideq*3]
+
+ pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] ; cdefgh, then h repeated
+ pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] ; abcdefgh, then h repeated
+ pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] ; bcdefgh, then h repeated
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 ; 3-tap avg: a2bc, b2cd, ...
+ pavgb m0, m2 ; 2-tap avg: ab, bc, cd, ...
+ punpcklbw m0, m3 ; interleaved output
+
+ movq [dstq ], m0 ; row 0
+ psrldq m0, 2
+ movq [dstq+strideq ], m0 ; row 1
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0 ; row 2
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0 ; row 3
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 4
+ pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
+ psrldq m0, 2
+ movq [dstq ], m0 ; row 4
+ psrldq m0, 2
+ movq [dstq+strideq ], m0 ; row 5
+ psrldq m0, 2
+ movq [dstq+strideq*2], m0 ; row 6
+ psrldq m0, 2
+ movq [dstq+stride3q ], m0 ; row 7
+ RESTORE_GOT
+ RET
+
+INIT_XMM ssse3 ; D207 intra predictor, 16x16 block: reads only left[0..15]
+cglobal d207_predictor_16x16, 2, 5, 5, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ lea stride3q, [strideq*3]
+ movifnidn leftq, leftmp ; fetch the left pointer argument into its register
+ mova m0, [leftq] ; abcdefghijklmnop [byte]
+ pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
+ pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; cdefghijklmnoppp
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg: a2bc, b2cd, ...
+ pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte]
+
+ punpckhbw m4, m1, m3 ; interleaved output, high half
+ punpcklbw m1, m3 ; interleaved output, low half
+ mova [dstq ], m1 ; row 0
+ palignr m3, m4, m1, 2
+ mova [dstq+strideq ], m3 ; row 1
+ palignr m3, m4, m1, 4
+ mova [dstq+strideq*2], m3 ; row 2
+ palignr m3, m4, m1, 6
+ mova [dstq+stride3q ], m3 ; row 3
+ lea dstq, [dstq+strideq*4] ; dstq now addresses row 4
+ palignr m3, m4, m1, 8
+ mova [dstq ], m3 ; row 4
+ palignr m3, m4, m1, 10
+ mova [dstq+strideq ], m3 ; row 5
+ palignr m3, m4, m1, 12
+ mova [dstq+strideq*2], m3 ; row 6
+ palignr m3, m4, m1, 14
+ mova [dstq+stride3q ], m3 ; row 7
+ DEFINE_ARGS dst, stride, stride3, line ; left is not read past this point
+ mov lined, 2 ; two passes of four rows: rows 8-15
+ mova m0, [GLOBAL(sh_b23456789abcdefff)] ; shuffle: drop 2 bytes, repeat the last byte
+.loop:
+ lea dstq, [dstq+strideq*4] ; advance to the next group of four rows
+ mova [dstq ], m4
+ pshufb m4, m0 ; next row: shift by one output pair
+ mova [dstq+strideq ], m4
+ pshufb m4, m0
+ mova [dstq+strideq*2], m4
+ pshufb m4, m0
+ mova [dstq+stride3q ], m4
+ pshufb m4, m0
+ dec lined
+ jnz .loop
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM ssse3 ; D207 intra predictor, 32x32 block: reads only left[0..31]
+cglobal d207_predictor_32x32, 2, 5, 8, dst, stride, stride3, left, goffset
+ GET_GOT goffsetq ; set up before the GLOBAL() table references below
+ lea stride3q, [strideq*3]
+ movifnidn leftq, leftmp ; fetch the left pointer argument into its register
+ mova m1, [leftq] ; 0-15 [byte]
+ mova m2, [leftq+16] ; 16-31 [byte]
+ pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] ; 18-31, 31, 31
+ pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] ; 17-31, 31
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 ; high 16px odd (3-tap) values
+ palignr m6, m2, m1, 1 ; 1-16
+ palignr m5, m2, m1, 2 ; 2-17
+ pavgb m2, m4 ; high 16px even lines
+
+ X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 ; low 16px odd (3-tap) values
+ pavgb m1, m6 ; low 16px even lines
+
+ punpckhbw m6, m1, m0 ; interleaved output 2
+ punpcklbw m1, m0 ; interleaved output 1
+
+ punpckhbw m7, m2, m3 ; interleaved output 4
+ punpcklbw m2, m3 ; interleaved output 3
+
+ ; output 1st 8 lines (and half of 2nd 8 lines)
+ DEFINE_ARGS dst, stride, stride3, dst8 ; left is not read past this point
+ lea dst8q, [dstq+strideq*8] ; dst8q runs 8 rows ahead of dstq
+ mova [dstq ], m1
+ mova [dstq +16], m6
+ mova [dst8q ], m6
+ palignr m0, m6, m1, 2
+ palignr m4, m2, m6, 2
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 4
+ palignr m4, m2, m6, 4
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 6
+ palignr m4, m2, m6, 6
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m0, m6, m1, 8
+ palignr m4, m2, m6, 8
+ mova [dstq ], m0
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m0, m6, m1, 10
+ palignr m4, m2, m6, 10
+ mova [dstq +strideq ], m0
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m0, m6, m1, 12
+ palignr m4, m2, m6, 12
+ mova [dstq +strideq*2 ], m0
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m0, m6, m1, 14
+ palignr m4, m2, m6, 14
+ mova [dstq +stride3q ], m0
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
+ mova [dstq +16], m2
+ mova [dst8q ], m2
+ palignr m4, m7, m2, 2
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 4
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 6
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ palignr m4, m7, m2, 8
+ mova [dstq +16], m4
+ mova [dst8q ], m4
+ palignr m4, m7, m2, 10
+ mova [dstq +strideq +16], m4
+ mova [dst8q+strideq ], m4
+ palignr m4, m7, m2, 12
+ mova [dstq +strideq*2+16], m4
+ mova [dst8q+strideq*2 ], m4
+ palignr m4, m7, m2, 14
+ mova [dstq +stride3q +16], m4
+ mova [dst8q+stride3q ], m4
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+
+ ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
+ mova m0, [GLOBAL(sh_b23456789abcdefff)] ; via GLOBAL() like every other table load, so PIC builds work
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0 ; next row: drop 2 bytes, repeat the last byte
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ mova [dstq +16], m7
+ mova [dst8q ], m7
+ pshufb m7, m0
+ mova [dstq +strideq +16], m7
+ mova [dst8q+strideq ], m7
+ pshufb m7, m0
+ mova [dstq +strideq*2+16], m7
+ mova [dst8q+strideq*2 ], m7
+ pshufb m7, m0
+ mova [dstq +stride3q +16], m7
+ mova [dst8q+stride3q ], m7
+ pshufb m7, m0
+ lea dstq, [dstq+strideq*4] ; dst8q is not used past this point
+
+ ; output last half of 4th 8 lines
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+ lea dstq, [dstq+strideq*4]
+ mova [dstq +16], m7
+ mova [dstq +strideq +16], m7
+ mova [dstq +strideq*2+16], m7
+ mova [dstq +stride3q +16], m7
+
+ ; done!
+ RESTORE_GOT
+ RET
diff --git a/vp9/common/x86/vp9_subpixel_8t_sse2.asm b/vp9/common/x86/vp9_subpixel_8t_sse2.asm
new file mode 100644
index 0000000..9dc8d0a
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_8t_sse2.asm
@@ -0,0 +1,987 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0 ;load the 8 filter taps and store paired-tap, rounding and zero constants (4-wide variant)
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040 ;0x0040 in each 16-bit half: the value 64 that is added before the >>7
+
+ movdqa xmm7, [rdx] ;load filters (eight 16-bit taps; movdqa requires a 16-byte aligned address)
+ pshuflw xmm0, xmm7, 0b ;k0 copied into the low four words
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8 ;move taps 4..7 down into the low qword
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1 ;low qword = k0 x4, high qword = k1 x4
+ punpcklqdq xmm2, xmm3 ;low qword = k2 x4, high qword = k3 x4
+ punpcklqdq xmm5, xmm4 ;low qword = k5 x4, high qword = k4 x4 (note the order)
+ punpcklqdq xmm6, xmm7 ;low qword = k6 x4, high qword = k7 x4
+
+ movdqa k0k1, xmm0 ;store the paired taps in the slots the caller %define'd
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0 ;broadcast the low dword: all eight words = 64
+ movdqa krd, xmm6 ;rounding constant
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7 ;all-zero constant used to widen bytes to words
+%endm
+
+%macro APPLY_FILTER_4 1 ;8-tap filter for 4 pixels: xmmN holds tap N's 4 input bytes; %1 != 0 averages with the bytes at [rdi]
+ punpckldq xmm0, xmm1 ;two rows in one register: tap 0 pixels, then tap 1 pixels
+ punpckldq xmm6, xmm7 ;tap 6 | tap 7
+ punpckldq xmm2, xmm3 ;tap 2 | tap 3
+ punpckldq xmm5, xmm4 ;tap 5 | tap 4 (same order as k5k4)
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum: taps 0+6 in the low half, taps 1+7 in the high half
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8 ;bring the high half (taps 1+7) down
+ paddsw xmm0, xmm1 ;low four words = taps 0+1+6+7
+ paddsw xmm0, xmm2 ;add tap 2
+ psrldq xmm2, 8 ;bring the tap 3 products down
+ paddsw xmm0, xmm5 ;add tap 5
+ psrldq xmm5, 8 ;bring the tap 4 products down
+ paddsw xmm0, xmm2 ;add tap 3 (taps 3 and 4 go last, see the note at the top of the file)
+ paddsw xmm0, xmm5 ;add tap 4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte (unsigned saturation)
+
+%if %1
+ movd xmm1, [rdi] ;averaging variant: fetch the 4 bytes already at the destination
+ pavgb xmm0, xmm1 ;rounded byte average with the filtered result
+%endif
+ movd [rdi], xmm0 ;store 4 output bytes
+%endm
+
+%macro GET_FILTERS 0 ;load src/dst pointers and store each of the 8 taps broadcast to a full register (8/16-wide variant)
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040 ;0x0040 in each 16-bit half: the value 64 that is added before the >>7
+
+ movdqa xmm7, [rdx] ;load filters (eight 16-bit taps; movdqa requires a 16-byte aligned address)
+ pshuflw xmm0, xmm7, 0b ;k0 copied into the low four words
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4 copied into the high four words
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0 ;duplicate the low words: all eight words = k0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4 ;duplicate the high words: all eight words = k4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0 ;broadcast the low dword: all eight words = 64
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7 ;all-zero constant used to widen bytes to words
+%endm
+
+%macro LOAD_VERT_8 1 ;load 8 bytes from each of 8 consecutive rows at byte offset %1; expects rax = pitch, rdx = 3 * pitch; advances rsi by one row
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax] ;step down one row; the offsets below are relative to row 1
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2 ;8-tap filter for 8 pixels: xmmN holds tap N's 8 input bytes; %1 != 0 averages with [rdi + %2]; %2 = output byte offset
+ punpcklbw xmm0, zero ;widen each tap's 8 input bytes to words
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0 ;multiply each input by its broadcast tap
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1 ;saturating sum in the order 0,1,6,7,2,5,3,4
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3 ;taps 3 and 4 are added last (see the note at the top of the file)
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte (unsigned saturation)
+%if %1
+ movq xmm1, [rdi + %2] ;averaging variant: fetch the 8 bytes already at the destination
+ pavgb xmm0, xmm1 ;rounded byte average with the filtered result
+%endif
+ movq [rdi + %2], xmm0 ;store 8 output bytes
+%endm
+
+;void vp9_filter_block1d4_v8_sse2 -- vertical 8-tap filter, 4 pixels per output row
+;(
+; unsigned char *src_ptr, -- first of the 8 source rows read for the first output row
+; unsigned int src_pitch, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 4 bytes are written per row
+; unsigned int out_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
+sym(vp9_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;reserve six 16-byte slots for the constants named below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4 ;fills the slots above; uses rdx and rcx as scratch
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax] ;advance one source row; the offsets below are relative to row 1
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0 ;filter and store 4 bytes at [rdi], no averaging
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_v8_sse2 -- vertical 8-tap filter, 8 pixels per output row
+;(
+; unsigned char *src_ptr, -- first of the 8 source rows read for the first output row
+; unsigned int src_pitch, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 8 bytes are written per row
+; unsigned int out_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
+sym(vp9_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = src_ptr, rdi = output_ptr and fills the slots above
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0 ;8 rows x 8 bytes into xmm0..xmm7; advances rsi by one row
+ APPLY_FILTER_8 0, 0 ;filter and store 8 bytes at [rdi], no averaging
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_sse2 -- vertical 8-tap filter, 16 pixels per output row (two 8-pixel halves)
+;(
+; unsigned char *src_ptr, -- first of the 8 source rows read for the first output row
+; unsigned int src_pitch, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 16 bytes are written per row
+; unsigned int out_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
+sym(vp9_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = src_ptr, rdi = output_ptr and fills the slots above
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0 ;left 8 columns; advances rsi by one row
+ APPLY_FILTER_8 0, 0 ;filter and store bytes 0..7 of the output row
+ sub rsi, rax ;undo that advance so the right half reads the same rows
+
+ LOAD_VERT_8 8 ;right 8 columns; advances rsi by one row again
+ APPLY_FILTER_8 0, 8 ;filter and store bytes 8..15 of the output row
+ add rdi, rbx ;next output row
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE ;vertical 8-tap filter, 4 wide, result averaged with the bytes already at the destination
+sym(vp9_filter_block1d4_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;reserve six 16-byte slots for the constants named below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4 ;fills the slots above from arg(5); uses rdx and rcx as scratch
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax] ;advance one source row; the offsets below are relative to row 1
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 1 ;filter, average with the 4 bytes at [rdi], store
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE ;vertical 8-tap filter, 8 wide, result averaged with the bytes already at the destination
+sym(vp9_filter_block1d8_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = arg(0), rdi = arg(2) and fills the slots above from arg(5)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0 ;8 rows x 8 bytes into xmm0..xmm7; advances rsi by one row
+ APPLY_FILTER_8 1, 0 ;filter, average with the 8 bytes at [rdi], store
+
+ lea rdi, [rdi + rbx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE ;vertical 8-tap filter, 16 wide (two 8-pixel halves), result averaged with the destination
+sym(vp9_filter_block1d16_v8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = arg(0), rdi = arg(2) and fills the slots above from arg(5)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2] ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+.loop:
+ LOAD_VERT_8 0 ;left 8 columns; advances rsi by one row
+ APPLY_FILTER_8 1, 0 ;filter, average with bytes 0..7 at [rdi], store
+ sub rsi, rax ;undo that advance so the right half reads the same rows
+
+ LOAD_VERT_8 8 ;right 8 columns; advances rsi by one row again
+ APPLY_FILTER_8 1, 8 ;filter, average with bytes 8..15 at [rdi], store
+ add rdi, rbx ;next output row
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d4_h8_sse2 -- horizontal 8-tap filter, 4 pixels per output row
+;(
+; unsigned char *src_ptr, -- output position in the source row; reads start 3 bytes before it
+; unsigned int src_pixels_per_line, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 4 bytes are written per row
+; unsigned int output_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
+sym(vp9_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;reserve six 16-byte slots for the constants named below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4 ;fills the slots above; uses rdx and rcx as scratch
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0 ;filter and store 4 bytes at [rdi], no averaging
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d8_h8_sse2 -- horizontal 8-tap filter, 8 pixels per output row
+;(
+; unsigned char *src_ptr, -- output position in the source row; reads start 3 bytes before it
+; unsigned int src_pixels_per_line, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 8 bytes are written per row
+; unsigned int output_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
+sym(vp9_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = src_ptr, rdi = output_ptr and fills the slots above
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0 ;filter and store 8 bytes at [rdi], no averaging
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_sse2 -- horizontal 8-tap filter, 16 pixels per output row (two 8-pixel halves)
+;(
+; unsigned char *src_ptr, -- output position in the source row; reads start 3 bytes before it
+; unsigned int src_pixels_per_line, -- byte step between source rows
+; unsigned char *output_ptr, -- destination; 16 bytes are written per row
+; unsigned int output_pitch, -- byte step between destination rows
+; unsigned int output_height, -- row count; the loop body runs once before the count is tested
+; short *filter -- eight 16-bit taps, read with an aligned 16-byte load
+;)
+global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
+sym(vp9_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = src_ptr, rdi = output_ptr and fills the slots above
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src for the left 8 pixels: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0 ;filter and store bytes 0..7 of the output row
+
+ movdqu xmm0, [rsi + 5] ;load src for the right 8 pixels: (rsi + 8) - 3
+
+ movdqa xmm1, xmm0 ;same copy-and-shift sequence as for the left half
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8 ;filter and store bytes 8..15 of the output row
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE ;horizontal 8-tap filter, 4 wide, result averaged with the bytes already at the destination
+sym(vp9_filter_block1d4_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6 ;reserve six 16-byte slots for the constants named below
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4 ;fills the slots above from arg(5); uses rdx and rcx as scratch
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 1 ;filter, average with the 4 bytes at [rdi], store
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE ;horizontal 8-tap filter, 8 wide, result averaged with the bytes already at the destination
+sym(vp9_filter_block1d8_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = arg(0), rdi = arg(2) and fills the slots above from arg(5)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0 ;filter, average with the 8 bytes at [rdi], store
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE ;horizontal 8-tap filter, 16 wide (two 8-pixel halves), result averaged with the destination
+sym(vp9_filter_block1d16_h8_avg_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10 ;reserve ten 16-byte slots for the constants named below
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS ;sets rsi = arg(0), rdi = arg(2) and fills the slots above from arg(5)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src for the left 8 pixels: 16 unaligned bytes starting 3 pixels to the left
+
+ movdqa xmm1, xmm0 ;seven more copies, one per remaining tap
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1 ;copy N is shifted right by N bytes so tap N's pixels start at byte 0
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 0 ;filter, average with bytes 0..7 at [rdi], store
+
+ movdqu xmm0, [rsi + 5] ;load src for the right 8 pixels: (rsi + 8) - 3
+
+ movdqa xmm1, xmm0 ;same copy-and-shift sequence as for the left half
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 1, 8 ;filter, average with bytes 8..15 at [rdi], store
+
+ lea rsi, [rsi + rax] ;next source row
+ lea rdi, [rdi + rdx] ;next output row
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10 ;release the constant slots
+ pop rsp ;presumably the rsp saved by ALIGN_STACK (macro defined outside this file) -- confirm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 224a724..d89d6b8 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -363,15 +363,14 @@
int i, j;
for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB,
- &fc->switchable_interp_prob[j][i]);
+ vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
}
static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &fc->inter_mode_probs[i][j]);
+ vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
}
static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -426,6 +425,45 @@
mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
}
+static INLINE void assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+ int_mv mv[2], int_mv best_mv[2],
+ int_mv nearest_mv[2], int_mv near_mv[2],
+ int is_compound, int allow_hp, vp9_reader *r) {
+ int i;
+ // Fill mv[0] (and mv[1] when is_compound is set) according to the inter mode.
+ switch (mode) {
+ case NEWMV: // read the vector(s) from r via read_mv, passing best_mv alongside
+ read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv,
+ &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+ if (is_compound)
+ read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
+ &cm->fc.nmvc, &cm->counts.mv, allow_hp);
+ break;
+ case NEARESTMV: // copy the caller-supplied nearest candidate(s)
+ mv[0].as_int = nearest_mv[0].as_int;
+ if (is_compound)
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ case NEARMV: // copy the caller-supplied near candidate(s)
+ mv[0].as_int = near_mv[0].as_int;
+ if (is_compound)
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ case ZEROMV: // zero vector(s)
+ mv[0].as_int = 0;
+ if (is_compound)
+ mv[1].as_int = 0;
+ break;
+ default: // any other mode only trips the assert; mv is left untouched
+ assert(!"Invalid inter mode value.");
+ }
+ // Assert-only check: every assigned component lies strictly between MV_LOW and MV_UPP.
+ for (i = 0; i < 1 + is_compound; ++i) {
+ assert(mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW);
+ assert(mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW);
+ }
+}
+
static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
@@ -445,10 +483,7 @@
int mi_row, int mi_col, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- nmv_context *const nmvc = &cm->fc.nmvc;
MB_MODE_INFO *const mbmi = &mi->mbmi;
- int_mv *const mv0 = &mbmi->mv[0];
- int_mv *const mv1 = &mbmi->mv[1];
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
@@ -518,41 +553,12 @@
mi_row, mi_col);
}
- switch (b_mode) {
- case NEWMV:
- read_mv(r, &block[0].as_mv, &best[0].as_mv, nmvc, &cm->counts.mv,
- allow_hp);
- if (is_compound)
- read_mv(r, &block[1].as_mv, &best[1].as_mv, nmvc, &cm->counts.mv,
- allow_hp);
- break;
- case NEARESTMV:
- block[0].as_int = nearest[0].as_int;
- if (is_compound)
- block[1].as_int = nearest[1].as_int;
- break;
- case NEARMV:
- block[0].as_int = nearmv[0].as_int;
- if (is_compound)
- block[1].as_int = nearmv[1].as_int;
- break;
- case ZEROMV:
- block[0].as_int = 0;
- if (is_compound)
- block[1].as_int = 0;
- break;
- default:
- assert(!"Invalid inter mode value");
- }
- mi->bmi[j].as_mv[0].as_int = block[0].as_int;
- assert(block[0].as_mv.row < MV_UPP && block[0].as_mv.row > MV_LOW);
- assert(block[0].as_mv.col < MV_UPP && block[0].as_mv.col > MV_LOW);
+ assign_mv(cm, b_mode, block, best, nearest, nearmv,
+ is_compound, allow_hp, r);
- if (is_compound) {
+ mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+ if (is_compound)
mi->bmi[j].as_mv[1].as_int = block[1].as_int;
- assert(block[1].as_mv.row < MV_UPP && block[1].as_mv.row > MV_LOW);
- assert(block[1].as_mv.col < MV_UPP && block[1].as_mv.col > MV_LOW);
- }
if (num_4x4_h == 2)
mi->bmi[j + 2] = mi->bmi[j];
@@ -562,43 +568,12 @@
}
mi->mbmi.mode = b_mode;
- mv0->as_int = mi->bmi[3].as_mv[0].as_int;
- mv1->as_int = mi->bmi[3].as_mv[1].as_int;
+
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
} else {
- switch (mbmi->mode) {
- case NEARMV:
- mv0->as_int = nearmv[0].as_int;
- if (is_compound)
- mv1->as_int = nearmv[1].as_int;
- break;
-
- case NEARESTMV:
- mv0->as_int = nearest[0].as_int;
- if (is_compound)
- mv1->as_int = nearest[1].as_int;
- break;
-
- case ZEROMV:
- mv0->as_int = 0;
- if (is_compound)
- mv1->as_int = 0;
- break;
-
- case NEWMV:
- read_mv(r, &mv0->as_mv, &best[0].as_mv, nmvc, &cm->counts.mv, allow_hp);
- if (is_compound)
- read_mv(r, &mv1->as_mv, &best[1].as_mv, nmvc, &cm->counts.mv,
- allow_hp);
- break;
- default:
- assert(!"Invalid inter mode value");
- }
- assert(mv0->as_mv.row < MV_UPP && mv0->as_mv.row > MV_LOW);
- assert(mv0->as_mv.col < MV_UPP && mv0->as_mv.col > MV_LOW);
- if (is_compound) {
- assert(mv1->as_mv.row < MV_UPP && mv1->as_mv.row > MV_LOW);
- assert(mv1->as_mv.col < MV_UPP && mv1->as_mv.col > MV_LOW);
- }
+ assign_mv(cm, mbmi->mode, mbmi->mv, best, nearest, nearmv,
+ is_compound, allow_hp, r);
}
}
@@ -630,17 +605,17 @@
if (cm->comp_pred_mode == HYBRID_PREDICTION)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_inter_prob[i]);
+ vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++) {
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][0]);
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][1]);
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+ vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
}
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_ref_prob[i]);
+ vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
}
void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
@@ -650,7 +625,7 @@
// TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
// vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.mbskip_probs[k]);
+ vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
nmv_context *const nmvc = &pbi->common.fc.nmvc;
@@ -663,18 +638,17 @@
read_switchable_interp_probs(&cm->fc, r);
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.intra_inter_prob[i]);
+ vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
read_comp_pred(cm, r);
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.y_mode_prob[j][i]);
+ vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB,
- &cm->fc.partition_prob[INTER_FRAME][j][i]);
+ vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
}
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 8b23c73..cc3422f 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -63,15 +63,15 @@
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 3; ++j)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p8x8[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 2; ++j)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p16x16[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
for (j = 0; j < TX_SIZES - 1; ++j)
- vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p32x32[i][j]);
+ vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
}
static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
@@ -101,19 +101,19 @@
if (tx_type == DCT_DCT)
xd->itxm_add(qcoeff, dst, stride, eob);
else
- vp9_iht_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
- vp9_iht_add_8x8(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
- vp9_iht_add_16x16(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob);
break;
case TX_32X32:
tx_type = DCT_DCT;
- vp9_idct_add_32x32(qcoeff, dst, stride, eob);
+ vp9_idct32x32_add(qcoeff, dst, stride, eob);
break;
default:
assert(!"Invalid transform size");
@@ -253,7 +253,7 @@
if (!less8x8) {
assert(mbmi->sb_type == bsize);
if (eobtotal == 0)
- mbmi->skip_coeff = 1; // skip loopfilter
+ mbmi->skip_coeff = 1; // skip loopfilter
}
set_ref(pbi, 0, mi_row, mi_col);
@@ -371,8 +371,7 @@
for (l = 0; l < PREV_COEF_CONTEXTS; l++)
if (k > 0 || l < 3)
for (m = 0; m < UNCONSTRAINED_NODES; m++)
- vp9_diff_update_prob(r, VP9_COEF_UPDATE_PROB,
- &coef_probs[i][j][k][l][m]);
+ vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
}
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
@@ -490,8 +489,7 @@
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- xd->itxm_add = xd->lossless ? vp9_idct_add_lossless
- : vp9_idct_add;
+ xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
}
static INTERPOLATIONFILTERTYPE read_interp_filter_type(
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 8fcf83e..a67945c 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -61,8 +61,6 @@
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
#define INCREMENT_COUNT(token) \
do { \
coef_counts[type][ref][band][pt] \
@@ -205,7 +203,6 @@
if (c < seg_eob)
coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN]++;
-
return c;
}
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index 6f01cea..df044c4 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -99,8 +99,8 @@
return word;
}
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p) {
- if (vp9_read(r, update_prob)) {
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
+ if (vp9_read(r, DIFF_UPDATE_PROB)) {
const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
*p = (vp9_prob)inv_remap_prob(delp, *p);
}
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 21ac313..aeb9399 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h
@@ -14,6 +14,6 @@
#include "vp9/decoder/vp9_dboolhuff.h"
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p);
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
#endif // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/vp9/decoder/vp9_thread.c b/vp9/decoder/vp9_thread.c
index dc3b681..5442ddf 100644
--- a/vp9/decoder/vp9_thread.c
+++ b/vp9/decoder/vp9_thread.c
@@ -29,7 +29,7 @@
//------------------------------------------------------------------------------
// simplistic pthread emulation layer
-#include <process.h>
+#include <process.h> // NOLINT
// _beginthreadex requires __stdcall
#define THREADFN unsigned int __stdcall
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index f7778a4..428ca7e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -22,7 +22,6 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_treecoder.h"
#include "vp9/common/vp9_systemdependent.h"
@@ -180,9 +179,8 @@
vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
n--;
- for (i = 0; i < n; ++i) {
- vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
- }
+ for (i = 0; i < n; ++i)
+ vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
}
static void update_mbintra_mode_probs(VP9_COMP* const cpi,
@@ -228,8 +226,7 @@
int k;
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
- vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
- MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+ vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
}
static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
@@ -252,7 +249,7 @@
for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- MODE_UPDATE_PROB, branch_ct[j][i]);
+ branch_ct[j][i]);
}
}
#ifdef MODE_STATS
@@ -274,7 +271,7 @@
for (j = 0; j < INTER_MODES - 1; ++j)
vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
- MODE_UPDATE_PROB, branch_ct[j]);
+ branch_ct[j]);
}
}
@@ -341,7 +338,7 @@
const vp9_prob *p) {
assert(is_inter_mode(mode));
write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[mode - NEARESTMV]);
+ &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
}
@@ -389,8 +386,8 @@
mi->ref_frame[0]);
}
- // if using the prediction mdoel we have nothing further to do because
- // the reference frame is fully coded by the segment
+ // If using the prediction model we have nothing further to do because
+ // the reference frame is fully coded by the segment.
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -407,8 +404,6 @@
const BLOCK_SIZE bsize = mi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
- x->partition_info = x->pi + (m - cm->mi);
-
#ifdef ENTROPY_STATS
active_section = 9;
#endif
@@ -490,7 +485,7 @@
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const int j = idy * 2 + idx;
- const MB_PREDICTION_MODE blockmode = x->partition_info->bmi[j].mode;
+ const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
write_sb_mv_ref(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
@@ -784,7 +779,7 @@
vp9_coeff_probs_model *old_frame_coef_probs =
cpi->common.fc.coef_probs[tx_size];
vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
switch (cpi->sf.use_fast_coef_updates) {
@@ -839,7 +834,7 @@
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ const vp9_prob upd = DIFF_UPDATE_PROB;
int s;
int u = 0;
if (l >= 3 && k == 0)
@@ -1122,26 +1117,23 @@
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
- ct_8x8p);
+ tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
- MODE_UPDATE_PROB, ct_8x8p[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
- ct_16x16p);
+ tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
- MODE_UPDATE_PROB, ct_16x16p[j]);
+ ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
- MODE_UPDATE_PROB, ct_32x32p[j]);
+ ct_32x32p[j]);
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
@@ -1471,7 +1463,6 @@
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->intra_inter_count[i]);
if (cm->allow_comp_inter_inter) {
@@ -1485,7 +1476,6 @@
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_inter_count[i]);
}
}
@@ -1493,10 +1483,8 @@
if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- MODE_UPDATE_PROB,
cpi->single_ref_count[i][1]);
}
}
@@ -1504,7 +1492,6 @@
if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- MODE_UPDATE_PROB,
cpi->comp_ref_count[i]);
update_mbintra_mode_probs(cpi, &header_bc);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2e28a2e..9b57bc3 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -23,16 +23,9 @@
int offset;
} search_site;
-typedef struct {
- struct {
- MB_PREDICTION_MODE mode;
- } bmi[4];
-} PARTITION_INFO;
-
// Structure to hold snapshot of coding context during the mode picking process
typedef struct {
MODE_INFO mic;
- PARTITION_INFO partition_info;
unsigned char zcoeff_blk[256];
int skip;
int_mv best_ref_mv;
@@ -87,9 +80,6 @@
MACROBLOCKD e_mbd;
int skip_block;
- PARTITION_INFO *partition_info; /* work pointer */
- PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
- PARTITION_INFO *pip; /* Base of allocated array */
search_site *ss;
int ss_count;
@@ -145,6 +135,7 @@
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
+ uint8_t token_cache[1024];
int optimize;
@@ -188,4 +179,23 @@
int y_blocks);
};
+struct rdcost_block_args {
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[16];
+ ENTROPY_CONTEXT t_left[16];
+ TX_SIZE tx_size;
+ int bw;
+ int bh;
+ int rate;
+ int64_t dist;
+ int64_t sse;
+ int this_rate;
+ int64_t this_dist;
+ int64_t this_sse;
+ int64_t this_rd;
+ int64_t best_rd;
+ int skip;
+ const int16_t *scan, *nb;
+};
+
#endif // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index 0f1aa59..32c136e 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -22,23 +22,28 @@
#endif
const unsigned int vp9_prob_cost[256] = {
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
+ 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889,
+ 873, 858, 843, 829, 816, 803, 790, 778, 767, 755, 744, 733,
+ 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541,
+ 534, 528, 522, 516, 511, 505, 499, 494, 488, 483, 477, 472,
+ 467, 462, 457, 452, 447, 442, 437, 433, 428, 424, 419, 415,
+ 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321,
+ 317, 314, 311, 307, 304, 301, 297, 294, 291, 288, 285, 281,
+ 278, 275, 272, 269, 266, 263, 260, 257, 255, 252, 249, 246,
+ 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184,
+ 181, 179, 177, 174, 172, 170, 168, 165, 163, 161, 159, 156,
+ 154, 152, 150, 148, 145, 143, 141, 139, 137, 135, 133, 131,
+ 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84,
+ 82, 81, 79, 77, 75, 73, 72, 70, 68, 66, 65, 63,
+ 61, 60, 58, 56, 55, 53, 51, 50, 48, 46, 45, 43,
+ 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6,
+ 4, 3, 1, 1};
void vp9_start_encode(vp9_writer *br, uint8_t *source) {
br->lowvalue = 0;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 27e4cd0..a232a86 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -17,7 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
-static void fdct4_1d(int16_t *input, int16_t *output) {
+static void fdct4(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
@@ -102,7 +102,7 @@
}
}
-static void fadst4_1d(int16_t *input, int16_t *output) {
+static void fadst4(const int16_t *input, int16_t *output) {
int x0, x1, x2, x3;
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -143,10 +143,10 @@
}
static const transform_2d FHT_4[] = {
- { fdct4_1d, fdct4_1d }, // DCT_DCT = 0
- { fadst4_1d, fdct4_1d }, // ADST_DCT = 1
- { fdct4_1d, fadst4_1d }, // DCT_ADST = 2
- { fadst4_1d, fadst4_1d } // ADST_ADST = 3
+ { fdct4, fdct4 }, // DCT_DCT = 0
+ { fadst4, fdct4 }, // ADST_DCT = 1
+ { fdct4, fadst4 }, // DCT_ADST = 2
+ { fadst4, fadst4 } // ADST_ADST = 3
};
void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
@@ -183,7 +183,7 @@
vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
}
-static void fdct8_1d(int16_t *input, int16_t *output) {
+static void fdct8(const int16_t *input, int16_t *output) {
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
/*canbe16*/ int x0, x1, x2, x3;
@@ -198,7 +198,7 @@
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -259,7 +259,7 @@
s6 = (input[1 * stride] - input[6 * stride]) * 4;
s7 = (input[0 * stride] - input[7 * stride]) * 4;
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -301,7 +301,7 @@
// Rows
for (i = 0; i < 8; ++i) {
- fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);
+ fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
@@ -368,7 +368,7 @@
step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
}
- // Work on the first eight values; fdct8_1d(input, even_results);
+ // Work on the first eight values; fdct8(input, even_results);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -384,7 +384,7 @@
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -486,7 +486,7 @@
}
}
-static void fadst8_1d(int16_t *input, int16_t *output) {
+static void fadst8(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
int x0 = input[7];
@@ -558,10 +558,10 @@
}
static const transform_2d FHT_8[] = {
- { fdct8_1d, fdct8_1d }, // DCT_DCT = 0
- { fadst8_1d, fdct8_1d }, // ADST_DCT = 1
- { fdct8_1d, fadst8_1d }, // DCT_ADST = 2
- { fadst8_1d, fadst8_1d } // ADST_ADST = 3
+ { fdct8, fdct8 }, // DCT_DCT = 0
+ { fadst8, fdct8 }, // ADST_DCT = 1
+ { fdct8, fadst8 }, // DCT_ADST = 2
+ { fadst8, fadst8 } // ADST_ADST = 3
};
void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
@@ -654,7 +654,7 @@
// Rewrote to use same algorithm as others.
-static void fdct16_1d(int16_t in[16], int16_t out[16]) {
+static void fdct16(const int16_t in[16], int16_t out[16]) {
/*canbe16*/ int step1[8];
/*canbe16*/ int step2[8];
/*canbe16*/ int step3[8];
@@ -680,7 +680,7 @@
step1[6] = in[1] - in[14];
step1[7] = in[0] - in[15];
- // fdct8_1d(step, step);
+ // fdct8(step, step);
{
/*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
/*needs32*/ int t0, t1, t2, t3;
@@ -696,7 +696,7 @@
s6 = input[1] - input[6];
s7 = input[0] - input[7];
- // fdct4_1d(step, step);
+ // fdct4(step, step);
x0 = s0 + s3;
x1 = s1 + s2;
x2 = s1 - s2;
@@ -795,7 +795,7 @@
out[15] = dct_const_round_shift(temp2);
}
-void fadst16_1d(int16_t *input, int16_t *output) {
+static void fadst16(const int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
int x0 = input[15];
@@ -958,10 +958,10 @@
}
static const transform_2d FHT_16[] = {
- { fdct16_1d, fdct16_1d }, // DCT_DCT = 0
- { fadst16_1d, fdct16_1d }, // ADST_DCT = 1
- { fdct16_1d, fadst16_1d }, // DCT_ADST = 2
- { fadst16_1d, fadst16_1d } // ADST_ADST = 3
+ { fdct16, fdct16 }, // DCT_DCT = 0
+ { fadst16, fdct16 }, // ADST_DCT = 1
+ { fdct16, fadst16 }, // DCT_ADST = 2
+ { fadst16, fadst16 } // ADST_ADST = 3
};
void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
@@ -1003,7 +1003,7 @@
return rv;
}
-static void dct32_1d(int *input, int *output, int round) {
+static void dct32_1d(const int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1fbdb72..631a276 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -22,6 +22,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
@@ -381,7 +382,6 @@
}
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
- *x->partition_info = ctx->partition_info;
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
}
@@ -492,9 +492,6 @@
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
x->active_ptr = cpi->active_map + idx_map;
- /* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
-
xd->mi_8x8 = cm->mi_grid_visible + idx_str;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
@@ -1866,8 +1863,7 @@
// printf("Switching to lossless\n");
cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+ cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
cpi->mb.optimize = 0;
cpi->common.lf.filter_level = 0;
cpi->zbin_mode_boost_enabled = 0;
@@ -1876,8 +1872,7 @@
// printf("Not lossless\n");
cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+ cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
}
}
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index a610d63..a0a7bab 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -24,9 +24,6 @@
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_tokenize.h"
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -43,15 +40,6 @@
}
}
-static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
- int16_t *dqcoeff, uint8_t *dest,
- int stride) {
- if (eob <= 1)
- xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
- else
- xd->inv_txm4x4_add(dqcoeff, dest, stride);
-}
-
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -80,8 +68,7 @@
vp9_subtract_sbuv(x, bsize);
}
-
-#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
typedef struct vp9_token_state vp9_token_state;
struct vp9_token_state {
@@ -92,7 +79,7 @@
short qc;
};
-// TODO: experiments to find optimal multiple numbers
+// TODO(jimbankoski): experiment to find optimal RD numbers.
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
@@ -272,11 +259,10 @@
best_index[i][1] = best;
/* Finally, make this the new head of the trellis. */
next = i;
- }
- /* There's no choice to make for a zero coefficient, so we don't
- * add a new trellis node, but we do need to update the costs.
- */
- else {
+ } else {
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
band = get_coef_band(band_translate, i + 1);
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
@@ -456,20 +442,19 @@
switch (tx_size) {
case TX_32X32:
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_1024_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
- vp9_idct_add_16x16(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_8X8:
- vp9_idct_add_8x8(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff,
- dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
break;
default:
assert(!"Invalid transform size");
@@ -554,7 +539,7 @@
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob)
- vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride);
+ vp9_idct32x32_1024_add(dqcoeff, dst, pd->dst.stride);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
@@ -579,7 +564,7 @@
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob)
- vp9_iht_add_16x16(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -604,7 +589,7 @@
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
if (!x->skip_encode && *eob)
- vp9_iht_add_8x8(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
@@ -636,9 +621,9 @@
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
else
- vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+ vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
}
break;
default:
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 04a4172..9ebcc49 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -337,7 +337,7 @@
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
- if (x->partition_info->bmi[i].mode == NEWMV)
+ if (mi->bmi[i].as_mode == NEWMV)
inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4719313..b2becbb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -8,8 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "math.h"
-#include "limits.h"
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
#include "vp9/encoder/vp9_block.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_variance.h"
@@ -23,7 +24,6 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
-#include <stdio.h>
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -77,7 +77,8 @@
}
-// Resets the first pass file to the given position using a relative seek from the current position
+// Resets the first pass file to the given position using a relative seek from
+// the current position.
static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
cpi->twopass.stats_in = position;
}
@@ -250,8 +251,10 @@
section->duration /= section->count;
}
-// Calculate a modified Error used in distributing bits between easier and harder frames
-static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+static double calculate_modified_err(VP9_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
const double av_err = stats->ssim_weighted_pred_err / stats->count;
const double this_err = this_frame->ssim_weighted_pred_err;
@@ -260,38 +263,43 @@
}
static const double weight_table[256] = {
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
- 0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
- 0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
- 0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
- 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
- 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+ 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500,
+ 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250,
+ 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000,
+ 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+ 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500,
+ 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+ 1.000000, 1.000000, 1.000000, 1.000000
};
static double simple_weight(YV12_BUFFER_CONFIG *source) {
@@ -300,7 +308,8 @@
uint8_t *src = source->y_buffer;
double sum_weights = 0.0;
- // Loop throught the Y plane raw examining levels and creating a weight for the image
+ // Loop through the Y plane examining levels and creating a weight for
+ // the image.
i = source->y_height;
do {
j = source->y_width;
@@ -340,7 +349,9 @@
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
+static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset) {
MACROBLOCKD *const xd = &x->e_mbd;
// Set up pointers for this macro block recon buffer
@@ -444,9 +455,9 @@
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
step_param + n, x->sadperbit16,
&num00, &v_fn_ptr,
@@ -504,7 +515,6 @@
setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL);
setup_dst_planes(xd, new_yv12, 0, 0);
- x->partition_info = x->pi;
xd->mi_8x8 = cm->mi_grid_visible;
// required for vp9_frame_init_quantizer
xd->this_mi =
@@ -574,16 +584,20 @@
// do intra 16x16 prediction
this_error = vp9_encode_intra(x, use_dc_pred);
- // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
- // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
- // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+ // intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
// This penalty adds a cost matching that of a 0,0 mv to the intra case.
this_error += intrapenalty;
// Cumulative intra error total
intra_error += (int64_t)this_error;
- // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+ BORDER_MV_PIXELS_B16;
@@ -604,7 +618,8 @@
&mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
- // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
if (best_ref_mv.as_int) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
@@ -644,9 +659,9 @@
sr_coded_error += gf_motion_error;
else
sr_coded_error += this_error;
- } else
+ } else {
sr_coded_error += motion_error;
-
+ }
/* Intra assumed best */
best_ref_mv.as_int = 0;
@@ -718,9 +733,9 @@
}
}
}
- } else
+ } else {
sr_coded_error += (int64_t)this_error;
-
+ }
coded_error += (int64_t)this_error;
// adjust to the next column of macroblocks
@@ -779,16 +794,19 @@
fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
fps.MVc = (double)sum_mvc / (double)mvcount;
fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
- fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
- fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) /
+ (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) /
+ (double)mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
fps.new_mv_count = new_mv_count;
fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
}
- // TODO: handle the case when duration is set to 0, or something less
- // than the full time between subsequent values of cpi->source_time_stamp.
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
fps.duration = (double)(cpi->source->ts_end
- cpi->source->ts_start);
@@ -808,15 +826,16 @@
2.0))) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
cpi->twopass.sr_update_lag = 1;
- } else
+ } else {
cpi->twopass.sr_update_lag++;
-
+ }
// swap frame pointers so last frame refers to the frame we just compressed
swap_yv12(lst_yv12, new_yv12);
vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
- // Special case for the first frame. Copy into the GF buffer as a second reference.
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
if (cm->current_video_frame == 0)
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
@@ -824,7 +843,8 @@
if (0) {
char filename[512];
FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
if (cm->current_video_frame == 0)
recon_file = fopen(filename, "wb");
@@ -836,7 +856,6 @@
}
cm->current_video_frame++;
-
}
// Estimate a cost per mb attributable to overheads such as the coding of
@@ -879,7 +898,7 @@
(av_intra * intra_cost)) * cpi->common.MBs) << 9;
// return mv_cost + mode_cost;
- // TODO PGW Fix overhead costs for extended Q range
+ // TODO(paulwilkins): Fix overhead costs for extended Q range.
#endif
return 0;
}
@@ -1103,8 +1122,8 @@
FIRSTPASS_STATS *start_pos;
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
@@ -1142,15 +1161,17 @@
// This variable monitors how far behind the second ref update is lagging
cpi->twopass.sr_update_lag = 1;
- // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+ // Scan the first pass file and calculate an average Intra / Inter error score
+ // ratio for the sequence.
{
double sum_iiratio = 0.0;
double IIRatio;
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note the starting "file" position.
while (input_stats(cpi, &this_frame) != EOF) {
- IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = this_frame.intra_error
+ / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
sum_iiratio += IIRatio;
}
@@ -1162,21 +1183,21 @@
reset_fpf_position(cpi, start_pos);
}
- // Scan the first pass file and calculate a modified total error based upon the bias/power function
- // used to allocate bits
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
{
- start_pos = cpi->twopass.stats_in; // Note starting "file" position
+ start_pos = cpi->twopass.stats_in; // Note starting "file" position
cpi->twopass.modified_error_total = 0.0;
cpi->twopass.modified_error_used = 0.0;
while (input_stats(cpi, &this_frame) != EOF) {
- cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ cpi->twopass.modified_error_total +=
+ calculate_modified_err(cpi, &this_frame);
}
cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
- reset_fpf_position(cpi, start_pos); // Reset file position
-
+ reset_fpf_position(cpi, start_pos); // Reset file position
}
}
@@ -1322,7 +1343,6 @@
(this_frame_mvc_ratio < this_frame->mvc_abs)
? (this_frame_mvc_ratio * motion_pct)
: this_frame->mvc_abs * motion_pct;
-
}
}
@@ -1381,7 +1401,8 @@
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1417,7 +1438,8 @@
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&this_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// We want to discount the the flash frame itself and the recovery
// frame that follows as both will have poor scores.
@@ -1433,7 +1455,6 @@
boost_score += (decay_accumulator *
calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out));
-
}
*b_boost = (int)boost_score;
@@ -1667,7 +1688,8 @@
// Update the motion related elements to the boost calculation
accumulate_frame_motion_stats(&next_frame,
&this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ &abs_mv_in_out_accumulator,
+ &mv_ratio_accumulator);
// Cumulative effect of prediction quality decay
if (!flash_detected) {
@@ -1710,8 +1732,7 @@
((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
(abs_mv_in_out_accumulator > 3.0) ||
(mv_in_out_accumulator < -2.0) ||
- ((boost_score - old_boost_score) < IIFACTOR))
- )) {
+ ((boost_score - old_boost_score) < IIFACTOR)))) {
boost_score = old_boost_score;
break;
}
@@ -1765,7 +1786,8 @@
(mv_in_out_accumulator > -2.0)) &&
(boost_score > 100)) {
// Alternative boost calculation for alt ref
- cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);
+ cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ &b_boost);
cpi->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
@@ -1842,9 +1864,9 @@
cpi->twopass.gf_group_bits =
(int64_t)(cpi->twopass.kf_group_bits *
(gf_group_err / cpi->twopass.kf_group_error_left));
- } else
+ } else {
cpi->twopass.gf_group_bits = 0;
-
+ }
cpi->twopass.gf_group_bits =
(cpi->twopass.gf_group_bits < 0)
? 0
@@ -1908,11 +1930,10 @@
if (gf_bits > alt_gf_bits)
gf_bits = alt_gf_bits;
- }
- // Else if it is harder than other frames in the group make sure it at
- // least receives an allocation in keeping with its relative error
- // score, otherwise it may be worse off than an "un-boosted" frame
- else {
+ } else {
+ // If it is harder than other frames in the group make sure it at
+ // least receives an allocation in keeping with its relative error
+ // score, otherwise it may be worse off than an "un-boosted" frame.
int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
mod_frame_err /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
@@ -2024,9 +2045,9 @@
// Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at
// the top end.
- if (target_frame_size < 0)
+ if (target_frame_size < 0) {
target_frame_size = 0;
- else {
+ } else {
if (target_frame_size > max_bits)
target_frame_size = max_bits;
@@ -2249,16 +2270,17 @@
if ((this_frame->pcnt_second_ref < 0.10) &&
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
- (
- ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
- ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
- ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
- (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
- ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
- )
- )
- )
- ) {
+ (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ .40) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
int i;
FIRSTPASS_STATS *start_pos;
@@ -2276,7 +2298,8 @@
// Examine how well the key frame predicts subsequent frames
for (i = 0; i < 16; i++) {
- next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
if (next_iiratio > RMAX)
next_iiratio = RMAX;
@@ -2285,7 +2308,8 @@
if (local_next_frame.pcnt_inter > 0.85)
decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
else
- decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+ decay_accumulator =
+ decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
// decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
@@ -2313,9 +2337,9 @@
// If there is tolerable prediction for at least the next 3 frames then
// break out else discard this potential key frame and move on
- if (boost_score > 30.0 && (i > 3))
+ if (boost_score > 30.0 && (i > 3)) {
is_viable_kf = 1;
- else {
+ } else {
// Reset the file position
reset_fpf_position(cpi, start_pos);
@@ -2375,8 +2399,9 @@
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
@@ -2416,9 +2441,9 @@
// forcekeyframeevery intervals then break out of the loop.
if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency)
break;
- } else
+ } else {
cpi->twopass.frames_to_key++;
-
+ }
i++;
}
@@ -2458,22 +2483,24 @@
reset_fpf_position(cpi, current_pos);
cpi->next_key_frame_forced = 1;
- } else
+ } else {
cpi->next_key_frame_forced = 0;
-
+ }
// Special case for the last frame of the file
if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) {
// Accumulate kf group error
kf_group_err += calculate_modified_err(cpi, this_frame);
- // These figures keep intra and coded error counts for all frames including key frames in the group.
- // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+ // These figures keep intra and coded error counts for all frames including
+ // key frames in the group. The effect of the key frame itself can be
+ // subtracted out using the first_frame data collected above.
kf_group_intra_err += this_frame->intra_error;
kf_group_coded_err += this_frame->coded_error;
}
// Calculate the number of bits that should be assigned to the kf group.
- if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0)) {
+ if ((cpi->twopass.bits_left > 0) &&
+ (cpi->twopass.modified_error_left > 0.0)) {
// Max for a single normal frame (not key frame)
int max_bits = frame_max_bits(cpi);
@@ -2490,13 +2517,14 @@
max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
if (cpi->twopass.kf_group_bits > max_grp_bits)
cpi->twopass.kf_group_bits = max_grp_bits;
- } else
+ } else {
cpi->twopass.kf_group_bits = 0;
-
+ }
// Reset the first pass file position
reset_fpf_position(cpi, start_position);
- // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+ // Determine how big to make this keyframe based on how well the subsequent
+ // frames use inter blocks.
decay_accumulator = 1.0;
boost_score = 0.0;
loop_decay_rate = 1.00; // Starting decay rate
@@ -2569,7 +2597,7 @@
if (kf_boost < (cpi->twopass.frames_to_key * 3))
kf_boost = (cpi->twopass.frames_to_key * 3);
- if (kf_boost < 300) // Min KF boost
+ if (kf_boost < 300) // Min KF boost
kf_boost = 300;
// Make a note of baseline boost and the zero motion
@@ -2604,10 +2632,13 @@
allocation_chunks /= divisor;
}
- cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
// Calculate the number of bits to be spent on the key frame
- cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+ cpi->twopass.kf_bits =
+ (int)((double)kf_boost *
+ ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
// If the key frame is actually easier than the average for the
// kf group (which does sometimes happen... eg a blank intro frame)
@@ -2625,11 +2656,10 @@
if (cpi->twopass.kf_bits > alt_kf_bits) {
cpi->twopass.kf_bits = alt_kf_bits;
}
- }
+ } else {
// Else if it is much harder than other frames in the group make sure
// it at least receives an allocation in keeping with its relative
// error score
- else {
alt_kf_bits =
(int)((double)cpi->twopass.bits_left *
(kf_mod_err /
@@ -2655,6 +2685,7 @@
cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
// Adjust the count of total modified error left.
- // The count of bits left is adjusted elsewhere based on real coded frame sizes
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
cpi->twopass.modified_error_left -= kf_group_err;
}
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 2296a66..c18d11e 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -10,6 +10,7 @@
#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
#define VP9_ENCODER_VP9_FIRSTPASS_H_
+#include "vp9/encoder/vp9_onyx_int.h"
void vp9_init_first_pass(VP9_COMP *cpi);
void vp9_first_pass(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 81445a9..c28c868 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -10,7 +10,7 @@
#include <assert.h>
#include <stdlib.h>
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_lookahead.h"
#include "vp9/common/vp9_extend.h"
@@ -77,7 +77,7 @@
goto bail;
}
return ctx;
-bail:
+ bail:
vp9_lookahead_destroy(ctx);
return NULL;
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 44eaa65..561c725 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -320,8 +320,8 @@
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
- // TODO: Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
+ // TODO(jbb): Each subsequent iteration checks at least one point in
+ // common with the last iteration (could be 2 if diagonal is selected).
while (halfiters--) {
// 1/2 pel
FIRST_LEVEL_CHECKS;
@@ -332,8 +332,8 @@
tc = bc;
}
- // TODO: Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // TODO(yaowu): Each subsequent iteration checks at least one point in common
+ // with the last iteration (could be 2 if diagonal is selected).
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
@@ -1122,8 +1122,10 @@
+ mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel... etc.
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
@@ -1192,8 +1194,9 @@
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1263,8 +1266,11 @@
+ mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
- // search_param determines the length of the initial step and hence the number of iterations
- // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
ss = &x->ss[search_param * x->searches_per_step];
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
@@ -1273,13 +1279,16 @@
for (step = 0; step < tot_steps; step++) {
int all_in = 1, t;
- // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
- // checking 4 bounds for each points.
+ // all_in is true if every one of the points we are checking is within
+ // the bounds of the image.
all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+ // for validity.
if (all_in) {
unsigned int sad_array[4];
@@ -1312,10 +1321,13 @@
this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -1365,8 +1377,9 @@
break;
};
#endif
- } else if (best_address == in_what)
+ } else if (best_address == in_what) {
(*num00)++;
+ }
}
this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1401,16 +1414,17 @@
n = num00;
num00 = 0;
- /* If there won't be more n-step search, check to see if refining search is needed. */
+ /* If there won't be more n-step search, check to see if refining search is
+ * needed. */
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
n++;
- if (num00)
+ if (num00) {
num00--;
- else {
+ } else {
thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, x->nmvjointcost, x->mvcost,
@@ -1504,7 +1518,8 @@
check_here = r * mv_stride + in_what + col_min;
for (c = col_min; c < col_max; c++) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
this_mv.as_mv.col = c;
thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
@@ -1621,7 +1636,8 @@
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
@@ -1639,7 +1655,6 @@
check_here++;
c++;
}
-
}
this_mv.as_mv.row = best_mv->as_mv.row * 8;
@@ -1770,7 +1785,8 @@
}
while (c < col_max) {
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.col = c;
@@ -1840,10 +1856,14 @@
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -1859,12 +1879,13 @@
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
@@ -1927,7 +1948,8 @@
block_offset[2] = best_address + 1;
block_offset[3] = best_address + in_what_stride;
- fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
for (j = 0; j < 4; j++) {
if (sad_array[j] < bestsad) {
@@ -1947,10 +1969,14 @@
this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
- if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
- (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) {
- check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
if (thissad < bestsad) {
this_mv.as_mv.row = this_row_offset;
@@ -1967,12 +1993,13 @@
}
}
- if (best_site == -1)
+ if (best_site == -1) {
break;
- else {
+ } else {
ref_mv->as_mv.row += neighbors[best_site].row;
ref_mv->as_mv.col += neighbors[best_site].col;
- best_address += (neighbors[best_site].row) * in_what_stride + neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
}
}
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index a5dfaed..b867d8b 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -17,7 +17,7 @@
void vp9_init_mode_costs(VP9_COMP *c) {
VP9_COMMON *const cm = &c->common;
- const vp9_tree_p KT = vp9_intra_mode_tree;
+ const vp9_tree_index *KT = vp9_intra_mode_tree;
int i, j;
for (i = 0; i < INTRA_MODES; i++) {
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e7384ba..0833b4a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -8,45 +8,35 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#include "vpx_config.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_tile_common.h"
-#include "vp9/encoder/vp9_segmentation.h"
-#include "./vp9_rtcd.h"
-#include "./vpx_scale_rtcd.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_timer.h"
-
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_bitstream.h"
-#include "vp9/encoder/vp9_picklpf.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/encoder/vp9_temporal_filter.h"
-
#include <math.h>
#include <stdio.h>
#include <limits.h>
+#include "./vpx_config.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_tile_common.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_psnr.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+#include "vpx_ports/vpx_timer.h"
+
+
extern void print_tree_update_probs();
static void set_default_lf_deltas(struct loopfilter *lf);
@@ -55,12 +45,12 @@
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
-#define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv
- for altref computation */
-#define HIGH_PRECISION_MV_QTHRESH 200 /* Q threshold for use of high precision
- mv. Choose a very high value for
- now so that HIGH_PRECISION is always
- chosen */
+#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
+ // for altref computation.
+#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
+ // mv. Choose a very high value for
+ // now so that HIGH_PRECISION is always
+ // chosen.
// Masks for partially or completely disabling split mode
#define DISABLE_ALL_SPLIT 0x3F
@@ -69,8 +59,6 @@
#define LAST_AND_INTRA_SPLIT_ONLY 0x1E
#if CONFIG_INTERNAL_STATS
-#include "math.h"
-
extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest, int lumamask,
double *weight);
@@ -113,7 +101,8 @@
#endif
#ifdef SPEEDSTATS
-unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0};
#endif
#if defined(SECTIONBITS_OUTPUT)
@@ -321,9 +310,6 @@
cpi->mb_activity_map = 0;
vpx_free(cpi->mb_norm_activity_map);
cpi->mb_norm_activity_map = 0;
-
- vpx_free(cpi->mb.pip);
- cpi->mb.pip = 0;
}
// Computes a q delta (in "q index" terms) to get from a starting q value
@@ -402,7 +388,6 @@
// Where relevant assume segment data is delta data
seg->abs_delta = SEGMENT_DELTADATA;
-
}
} else if (seg->enabled) {
// All other frames if segmentation has been enabled
@@ -752,8 +737,10 @@
sf->mode_search_skip_flags = 0;
sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->intra_y_mode_mask = ALL_INTRA_MODES;
- sf->intra_uv_mode_mask = ALL_INTRA_MODES;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = ALL_INTRA_MODES;
+ sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES;
+ }
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
@@ -770,7 +757,7 @@
#endif
switch (mode) {
- case 0: // best quality mode
+ case 0: // This is the best quality mode.
break;
case 1:
@@ -782,16 +769,19 @@
#endif
sf->use_avoid_tested_higherror = 1;
sf->adaptive_rd_thresh = 1;
+ sf->recode_loop = (speed < 1);
if (speed == 1) {
sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only);
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only)
- ? USE_FULL_RD : USE_LARGESTALL);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only)
+ ? USE_FULL_RD : USE_LARGESTALL;
if (MIN(cpi->common.width, cpi->common.height) >= 720)
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
else
sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
@@ -799,19 +789,26 @@
sf->adaptive_motion_search = 1;
sf->auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
+ sf->recode_loop = 2;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
}
if (speed == 2) {
sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only);
+ sf->less_rectangular_check = 1;
sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
cpi->common.intra_only)
? USE_FULL_RD : USE_LARGESTALL);
if (MIN(cpi->common.width, cpi->common.height) >= 720)
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->disable_split_mask = cpi->common.show_frame ?
+ DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
else
sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
@@ -831,6 +828,10 @@
sf->adaptive_rd_thresh = 2;
sf->mode_skip_start = 11;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
}
if (speed == 3) {
sf->use_square_partition_only = 1;
@@ -910,11 +911,9 @@
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->use_one_partition_size_always = 1;
sf->always_this_block_size = BLOCK_16X16;
- sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME ||
- cpi->common.intra_only ||
- cpi->common.show_frame == 0) ?
- USE_FULL_RD :
- USE_LARGESTALL);
+ sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME ||
+ cpi->common.intra_only) ?
+ USE_FULL_RD : USE_LARGESTALL;
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
@@ -933,14 +932,15 @@
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
sf->disable_filter_search_var_thresh = 96;
- sf->intra_y_mode_mask = INTRA_DC_ONLY;
- sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
+ }
sf->use_fast_coef_updates = 2;
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
}
break;
-
}; /* switch */
// Set rd thresholds based on mode and speed setting
@@ -997,20 +997,6 @@
"Failed to allocate altref buffer");
}
-static int alloc_partition_data(VP9_COMP *cpi) {
- vpx_free(cpi->mb.pip);
-
- cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride *
- (cpi->common.mi_rows + MI_BLOCK_SIZE),
- sizeof(PARTITION_INFO));
- if (!cpi->mb.pip)
- return 1;
-
- cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
-
- return 0;
-}
-
void vp9_alloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
@@ -1018,10 +1004,6 @@
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- if (alloc_partition_data(cpi))
- vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate partition data");
-
if (vp9_alloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -1093,10 +1075,6 @@
}
-// TODO perhaps change number of steps expose to outside world when setting
-// max and min limits. Also this will likely want refining for the extended Q
-// range.
-//
// Table that converts 0-63 Q range values passed in outside to the Qindex
// range used internally.
static const int q_trans[] = {
@@ -1123,11 +1101,14 @@
if (framerate < 0.1)
framerate = 30;
- cpi->oxcf.framerate = framerate;
- cpi->output_framerate = cpi->oxcf.framerate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->oxcf.framerate = framerate;
+ cpi->output_framerate = cpi->oxcf.framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
@@ -1260,14 +1241,8 @@
cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
cpi->oxcf.lossless = oxcf->lossless;
- if (cpi->oxcf.lossless) {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
- } else {
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
- }
-
+ cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
+ : vp9_idct4x4_add;
cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -1280,7 +1255,7 @@
cm->reset_frame_context = 0;
setup_features(cm);
- cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation
+ cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision
set_mvcost(&cpi->mb);
{
@@ -1521,7 +1496,7 @@
/*Initialize the feed-forward activity masking.*/
cpi->activity_avg = 90 << 12;
- cpi->frames_since_key = 8; // Give a sensible default for the first frame.
+ cpi->frames_since_key = 8; // Sensible default for first frame.
cpi->key_frame_frequency = cpi->oxcf.key_freq;
cpi->this_key_frame_forced = 0;
cpi->next_key_frame_forced = 0;
@@ -1803,8 +1778,10 @@
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
- double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
- double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * (double) 8 / (double)1000
+ / time_encoded;
if (cpi->b_calculate_psnr) {
YV12_BUFFER_CONFIG *lst_yv12 =
@@ -1824,20 +1801,15 @@
dr, cpi->total / cpi->count, total_psnr,
cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp,
total_encode_time);
-// fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f %10ld\n",
-// dr, cpi->total / cpi->count, total_psnr,
-// cpi->totalp / cpi->count, total_psnr2, total_ssim,
-// total_encode_time, cpi->tot_recode_hits);
}
if (cpi->b_calculate_ssimg) {
fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
-// fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f %10ld\n", dr,
-// cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
-// cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time, cpi->tot_recode_hits);
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
}
fclose(f);
@@ -1884,11 +1856,9 @@
"[INTRA_MODES] =\n{\n");
for (i = 0; i < INTRA_MODES; i++) {
-
fprintf(fmode, " { // Above Mode : %d\n", i);
for (j = 0; j < INTRA_MODES; j++) {
-
fprintf(fmode, " {");
for (k = 0; k < INTRA_MODES; k++) {
@@ -1899,11 +1869,9 @@
}
fprintf(fmode, "}, // left_mode %d\n", j);
-
}
fprintf(fmode, " },\n");
-
}
fprintf(fmode, "};\n");
@@ -1937,14 +1905,14 @@
(cpi->time_receive_data + cpi->time_compress_data) / 1000);
}
#endif
-
}
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
- for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]); i++) {
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) /
+ sizeof(cpi->mbgraph_stats[0]); ++i) {
vpx_free(cpi->mbgraph_stats[i].mb_stats);
}
@@ -1971,7 +1939,6 @@
fclose(kf_list);
#endif
-
}
@@ -2292,14 +2259,15 @@
cpi->frames_since_golden = 0;
// ******** Fixed Q test code only ************
- // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+ // If we are going to use the ALT reference for the next group of frames
+ // set a flag to say so.
if (cpi->oxcf.fixed_q >= 0 &&
cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
cpi->source_alt_ref_pending = 1;
cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
- // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a
- // large GF_interval
+ // TODO(ivan): For SVC encoder, GF automatic update is disabled by using
+ // a large GF_interval.
if (cpi->use_svc) {
cpi->frames_till_gf_update_due = INT_MAX;
}
@@ -2339,12 +2307,12 @@
return i;
}
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) {
+static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
+ unsigned int *frame_flags) {
(void) size;
(void) dest;
(void) frame_flags;
-
vp9_set_quantizer(cpi, find_fp_qindex());
vp9_first_pass(cpi);
}
@@ -2352,13 +2320,11 @@
#define WRITE_RECON_BUFFER 0
#if WRITE_RECON_BUFFER
void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-
- // write the frame
FILE *yframe;
int i;
char filename[255];
- sprintf(filename, "cx\\y%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->y_height; i++)
@@ -2366,7 +2332,7 @@
frame->y_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\u%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2374,7 +2340,7 @@
frame->uv_width, 1, yframe);
fclose(yframe);
- sprintf(filename, "cx\\v%04d.raw", this_frame);
+ snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
yframe = fopen(filename, "wb");
for (i = 0; i < frame->uv_height; i++)
@@ -2396,8 +2362,10 @@
for (i = 1; i < frame->y_height - 1; i++) {
for (j = 1; j < frame->y_width - 1; j++) {
/* Sobel hor and ver gradients */
- int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + (next[1] - next[-1]);
- int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + (prev[-1] - next[-1]);
+ int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) +
+ (next[1] - next[-1]);
+ int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) +
+ (prev[-1] - next[-1]);
h = (h < 0 ? -h : h);
v = (v < 0 ? -v : v);
if (h > EDGE_THRESH || v > EDGE_THRESH)
@@ -2433,10 +2401,9 @@
if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
((cpi->projected_frame_size < low_limit) && (q > minq))) {
force_recode = 1;
- }
- // Special Constrained quality tests
- else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- // Undershoot and below auto cq level
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
if (q > cpi->cq_target_quality &&
cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
force_recode = 1;
@@ -2597,152 +2564,75 @@
}
}
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+ int recon_err;
-static void encode_frame_to_data_rate(VP9_COMP *cpi,
- unsigned long *size,
- unsigned char *dest,
- unsigned int *frame_flags) {
- VP9_COMMON *cm = &cpi->common;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
- TX_SIZE t;
- int q;
- int frame_over_shoot_limit;
- int frame_under_shoot_limit;
+ vp9_clear_system_state(); // __asm emms;
- int loop = 0;
- int loop_count;
+ recon_err = vp9_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
- int q_low;
- int q_high;
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
+ "%10.3f %8d %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size, 0,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ vp9_convert_qindex_to_q(cm->base_qindex),
+ (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
+ vp9_convert_qindex_to_q(cpi->active_best_quality),
+ vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
+ vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->cq_target_quality),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ (double)cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
- int top_index;
- int bottom_index;
- int active_worst_qchanged = 0;
+ fclose(f);
- int overshoot_seen = 0;
- int undershoot_seen = 0;
+ if (0) {
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
- SPEED_FEATURES *sf = &cpi->sf;
- unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
- struct segmentation *seg = &cm->seg;
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
- /* Scale the source buffer, if required */
- if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
- cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
- scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
- cpi->Source = &cpi->scaled_source;
- } else {
- cpi->Source = cpi->un_scaled_source;
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+ for (i = 0; i < MAX_REFS; ++i)
+ fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
}
+}
+#endif
- scale_references(cpi);
-
- // Clear down mmx registers to allow floating point in what follows
- vp9_clear_system_state();
-
-
- // For an alt ref frame in 2 pass we skip the call to the second
- // pass function that sets the target bandwidth so must set it here
- if (cpi->refresh_alt_ref_frame) {
- // Per frame bit target for the alt ref frame
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
- // per second target bitrate
- cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
- cpi->output_framerate);
- }
-
- // Clear zbin over-quant value and mode boost values.
- cpi->zbin_mode_boost = 0;
-
- // Enable or disable mode based tweaking of the zbin
- // For 2 Pass Only used where GF/ARF prediction quality
- // is above a threshold
- cpi->zbin_mode_boost = 0;
-
- // if (cpi->oxcf.lossless)
- cpi->zbin_mode_boost_enabled = 0;
- // else
- // cpi->zbin_mode_boost_enabled = 1;
-
- // Current default encoder behaviour for the altref sign bias
- cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
-
- // Check to see if a key frame is signaled
- // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
- if ((cm->current_video_frame == 0) ||
- (cm->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0))) {
- // Key frame from VFW/auto-keyframe/first frame
- cm->frame_type = KEY_FRAME;
- }
-
- // Set default state for segment based loop filter update flags
- cm->lf.mode_ref_delta_update = 0;
-
- // Initialize cpi->mv_step_param to default based on max resolution
- cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
- // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
- if (sf->auto_mv_step_size) {
- if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
- // initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution
- cpi->mv_step_param = vp9_init_search_range(
- cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
-
- // Set various flags etc to special state if it is a key frame
- if (cm->frame_type == KEY_FRAME) {
- // Reset the loop filter deltas and segmentation map
- setup_features(cm);
-
- // If segmentation is enabled force a map update for key frames
- if (seg->enabled) {
- seg->update_map = 1;
- seg->update_data = 1;
- }
-
- // The alternate reference frame cannot be active for a key frame
- cpi->source_alt_ref_active = 0;
-
- cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
- cm->frame_parallel_decoding_mode =
- (cpi->oxcf.frame_parallel_decoding_mode != 0);
- if (cm->error_resilient_mode) {
- cm->frame_parallel_decoding_mode = 1;
- cm->reset_frame_context = 0;
- cm->refresh_frame_context = 0;
- }
- }
-
- // Configure experimental use of segmentation for enhanced coding of
- // static regions if indicated.
- // Only allowed for now in second pass of two pass (as requires lagged coding)
- // and if the relevant speed feature flag is set.
- if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
- configure_static_seg_features(cpi);
- }
-
- // Decide how big to make the frame
- vp9_pick_frame_size(cpi);
-
- vp9_clear_system_state();
-
+static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+ int * bottom_index, int * top_index) {
// Set an active best quality and if necessary active worst quality
- q = cpi->active_worst_quality;
+ int q = cpi->active_worst_quality;
+ VP9_COMMON *const cm = &cpi->common;
if (cm->frame_type == KEY_FRAME) {
#if !CONFIG_MULTIPLE_ARF
- // Special case for key frames forced because we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
if (cpi->this_key_frame_forced) {
int delta_qindex;
int qindex = cpi->last_boosted_qindex;
@@ -2786,12 +2676,13 @@
cpi->active_best_quality = cpi->active_worst_quality
+ compute_qdelta(cpi, current_q, current_q * 0.3);
#endif
- } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
int high = 2000;
int low = 400;
// Use the lower of cpi->active_worst_quality and recent
- // average Q as basis for GF/ARF Q limit unless last frame was
+ // average Q as basis for GF/ARF best Q limit unless last frame was
// a key frame.
if (cpi->frames_since_key > 1 &&
cpi->avg_frame_qindex < cpi->active_worst_quality) {
@@ -2832,14 +2723,10 @@
}
}
} else {
- if (!cpi->refresh_alt_ref_frame) {
- cpi->active_best_quality = inter_minq[q];
- } else {
cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
low, high,
gf_low_motion_minq,
gf_high_motion_minq);
- }
}
} else {
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
@@ -2855,7 +2742,7 @@
cpi->active_best_quality = inter_minq[q];
#endif
- // For the constant/constrained quality mode we don't want
+ // For the constrained quality mode we don't want
// q to fall below the cq level.
if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
(cpi->active_best_quality < cpi->cq_target_quality)) {
@@ -2883,16 +2770,171 @@
if (cpi->active_worst_quality < cpi->active_best_quality)
cpi->active_worst_quality = cpi->active_best_quality;
- // Special case code to try and match quality with forced key frames
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
+ *top_index = cpi->active_best_quality;
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ *top_index =
+ (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+ } else {
+ *top_index = cpi->active_worst_quality;
+ }
+ *bottom_index = cpi->active_best_quality;
+
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
q = cpi->active_best_quality;
+ // Special case code to try and match quality with forced key frames
} else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
q = cpi->last_boosted_qindex;
} else {
// Determine initial Q to try
q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ if (q > *top_index)
+ q = *top_index;
}
+ return q;
+}
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned int *frame_flags) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ TX_SIZE t;
+ int q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int loop = 0;
+ int loop_count;
+
+ int q_low;
+ int q_high;
+
+ int top_index;
+ int bottom_index;
+ int active_worst_qchanged = 0;
+
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+
+ SPEED_FEATURES *const sf = &cpi->sf;
+ unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
+ struct segmentation *const seg = &cm->seg;
+
+ /* Scale the source buffer, if required. */
+ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width ||
+ cm->mi_rows * 8 != cpi->un_scaled_source->y_height) {
+ scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);
+ cpi->Source = &cpi->scaled_source;
+ } else {
+ cpi->Source = cpi->un_scaled_source;
+ }
+ scale_references(cpi);
+
+ // Clear down mmx registers to allow floating point in what follows.
+ vp9_clear_system_state();
+
+ // For an alt ref frame in 2 pass we skip the call to the second
+ // pass function that sets the target bandwidth so we must set it here.
+ if (cpi->refresh_alt_ref_frame) {
+ // Set a per frame bit target for the alt ref frame.
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ // Set a per second target bitrate.
+ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
+ }
+
+ // Clear zbin over-quant value and mode boost values.
+ cpi->zbin_mode_boost = 0;
+
+ // Enable or disable mode based tweaking of the zbin.
+ // For 2 pass only used where GF/ARF prediction quality
+ // is above a threshold.
+ cpi->zbin_mode_boost = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+
+ // Current default encoder behavior for the altref sign bias.
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active;
+
+ // Check to see if a key frame is signaled.
+ // For two pass with auto key frame enabled cm->frame_type may already be
+ // set, but not for one pass.
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key %
+ cpi->key_frame_frequency == 0))) {
+ // Set frame type to key frame for the force key frame, if we exceed the
+ // maximum distance in an automatic keyframe selection or for the first
+ // frame.
+ cm->frame_type = KEY_FRAME;
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ // Initialize cpi->mv_step_param to default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def);
+ // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
+ if (sf->auto_mv_step_size) {
+ if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame)
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = vp9_init_search_range(
+ cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+
+ // Set various flags etc to special state if it is a key frame.
+ if (cm->frame_type == KEY_FRAME) {
+ // Reset the loop filter deltas and segmentation map.
+ setup_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
+ cm->frame_parallel_decoding_mode =
+ (cpi->oxcf.frame_parallel_decoding_mode != 0);
+ if (cm->error_resilient_mode) {
+ cm->frame_parallel_decoding_mode = 1;
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 0;
+ }
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in second pass of two pass (as requires lagged coding)
+ // and if the relevant speed feature flag is set.
+ if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {
+ configure_static_seg_features(cpi);
+ }
+
+ // Decide how big to make the frame.
+ vp9_pick_frame_size(cpi);
+
+ vp9_clear_system_state();
+
+ q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+
+ q_high = top_index;
+ q_low = bottom_index;
+
vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
&frame_over_shoot_limit);
@@ -2915,16 +2957,9 @@
q_high = q;
printf("frame:%d q:%d\n", cm->current_video_frame, q);
- } else {
-#endif
- // Limit Q range for the adaptive loop.
- bottom_index = cpi->active_best_quality;
- top_index = cpi->active_worst_quality;
- q_low = cpi->active_best_quality;
- q_high = cpi->active_worst_quality;
-#if CONFIG_MULTIPLE_ARF
}
#endif
+
loop_count = 0;
vp9_zero(cpi->rd_tx_select_threshes);
@@ -2974,7 +3009,6 @@
vp9_set_quantizer(cpi, q);
if (loop_count == 0) {
-
// Set up entropy depending on frame type.
if (cm->frame_type == KEY_FRAME) {
/* Choose which entropy context to use. When using a forward reference
@@ -3016,10 +3050,10 @@
frame_over_shoot_limit = 1;
active_worst_qchanged = 0;
- // Special case handling for forced key frames
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
loop = 0;
} else {
+ // Special case handling for forced key frames
if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
int last_q = q;
int kf_err = vp9_calc_ss_err(cpi->Source,
@@ -3261,9 +3295,11 @@
// Keep a record of ambient average Q.
if (cm->frame_type != KEY_FRAME)
- cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
+ cm->base_qindex) >> 2;
- // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+ // Keep a record from which we can calculate the average Q excluding GF
+ // updates and key frames.
if (cm->frame_type != KEY_FRAME &&
!cpi->refresh_golden_frame &&
!cpi->refresh_alt_ref_frame) {
@@ -3281,7 +3317,8 @@
if (!cm->show_frame)
cpi->bits_off_target -= cpi->projected_frame_size;
else
- cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth -
+ cpi->projected_frame_size;
// Clip the buffer level at the maximum buffer size
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
@@ -3305,122 +3342,28 @@
cpi->total_actual_bits += cpi->projected_frame_size;
// Debug stats
- cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+ cpi->total_target_vs_actual += (cpi->this_frame_target -
+ cpi->projected_frame_size);
cpi->buffer_level = cpi->bits_off_target;
- // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+ // Update bits left to the kf and gf groups to account for overshoot or
+ // undershoot on these frames
if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.kf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
} else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+ cpi->twopass.gf_group_bits += cpi->this_frame_target -
+ cpi->projected_frame_size;
cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
}
- // Update the skip mb flag probabilities based on the distribution seen
- // in this frame.
- // update_base_skip_probs(cpi);
-
-#if 0 // CONFIG_INTERNAL_STATS
- {
- FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int recon_err;
-
- vp9_clear_system_state(); // __asm emms;
-
- recon_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
-
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
- "%10.3f %8d %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->this_frame_target,
- cpi->projected_frame_size, 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- (double)cpi->twopass.bits_left /
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
- else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"
- "%8d %10d %10d %10d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
- 0, //loop_size_estimate,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits,
- cm->base_qindex,
- vp9_convert_qindex_to_q(cm->base_qindex),
- (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality),
- cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
- vp9_convert_qindex_to_q(cpi->cq_target_quality),
- cpi->refresh_last_frame,
- cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,
- cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
- cpi->tot_recode_hits, recon_err, cpi->kf_boost,
- cpi->kf_zeromotion_pct);
-
- fclose(f);
-
- if (0) {
- FILE *fmodes = fopen("Modes.stt", "a");
- int i;
-
- fprintf(fmodes, "%6d:%1d:%1d:%1d ",
- cpi->common.current_video_frame,
- cm->frame_type, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame);
-
- for (i = 0; i < MAX_MODES; ++i)
- fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
- for (i = 0; i < MAX_REFS; ++i)
- fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]);
-
- fprintf(fmodes, "\n");
-
- fclose(fmodes);
- }
- }
-
-#endif
-
#if 0
- // Debug stats for segment feature experiments.
- print_seg_map(cpi);
+ output_frame_level_debug_stats(cpi);
#endif
-
// If this was a kf or Gf note the Q
if ((cm->frame_type == KEY_FRAME)
|| cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
@@ -3504,7 +3447,8 @@
#endif
}
- // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
cm->seg.update_map = 0;
cm->seg.update_data = 0;
cm->lf.mode_ref_delta_update = 0;
@@ -3536,28 +3480,10 @@
// restore prev_mi
cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
-
- #if 0
- {
- char filename[512];
- FILE *recon_file;
- sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
- recon_file = fopen(filename, "wb");
- fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,
- cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,
- 1, recon_file);
- fclose(recon_file);
- }
-#endif
-#ifdef OUTPUT_YUV_REC
- vp9_write_yuv_rec_frame(cm);
-#endif
-
}
static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
unsigned char *dest, unsigned int *frame_flags) {
-
cpi->enable_encode_breakout = 1;
if (!cpi->refresh_alt_ref_frame)
@@ -3574,12 +3500,14 @@
if (!cpi->refresh_alt_ref_frame) {
double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section / 100);
+ * cpi->oxcf.two_pass_vbrmin_section
+ / 100);
if (two_pass_min_rate < lower_bounds_min_rate)
two_pass_min_rate = lower_bounds_min_rate;
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
+ / cpi->oxcf.framerate);
}
}
@@ -3917,7 +3845,6 @@
cpi->bytes += *size;
if (cm->show_frame) {
-
cpi->count++;
if (cpi->b_calculate_psnr) {
@@ -4027,9 +3954,9 @@
vp9_ppflags_t *flags) {
VP9_COMP *cpi = (VP9_COMP *) comp;
- if (!cpi->common.show_frame)
+ if (!cpi->common.show_frame) {
return -1;
- else {
+ } else {
int ret;
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(&cpi->common, dest, flags);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 2652929..f88ae8a 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -36,7 +36,7 @@
#define DISABLE_RC_LONG_TERM_MEM 0
#endif
-#define MODE_TEST_HIT_STATS
+// #define MODE_TEST_HIT_STATS
// #define SPEEDSTATS 1
#if CONFIG_MULTIPLE_ARF
@@ -230,6 +230,7 @@
#define ALL_INTRA_MODES 0x3FF
#define INTRA_DC_ONLY 0x01
#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED))
#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
typedef enum {
@@ -285,8 +286,8 @@
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
- int intra_y_mode_mask;
- int intra_uv_mode_mask;
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
int use_rd_breakout;
int use_uv_intra_rd_estimate;
int use_fast_lpf_pick;
@@ -314,6 +315,7 @@
MACROBLOCK mb;
VP9_COMMON common;
VP9_CONFIG oxcf;
+ struct rdcost_block_args rdcost_stack;
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 239fd6b..476ecaa 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -54,7 +54,8 @@
src += srcoffset;
dst += dstoffset;
- // Loop through the Y plane raw and reconstruction data summing (square differences)
+ // Loop through the raw Y plane and reconstruction data summing the square
+ // differences.
for (i = 0; i < linestocopy; i += 16) {
for (j = 0; j < source->y_width; j += 16) {
unsigned int sse;
@@ -72,20 +73,6 @@
// Enforce a minimum filter level based upon baseline Q
static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) {
int min_filter_level;
- /*int q = (int) vp9_convert_qindex_to_q(base_qindex);
-
- if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
- min_filter_level = 0;
- else
- {
- if (q <= 10)
- min_filter_level = 0;
- else if (q <= 64)
- min_filter_level = 1;
- else
- min_filter_level = (q >> 6);
- }
- */
min_filter_level = 0;
return min_filter_level;
@@ -93,11 +80,7 @@
// Enforce a maximum filter level based upon baseline Q
static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) {
- // PGW August 2006: Highest filter values almost always a bad idea
-
- // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
- // with lots of intra coming in.
- int max_filter_level = MAX_LOOP_FILTER;// * 3 / 4;
+ int max_filter_level = MAX_LOOP_FILTER;
(void)base_qindex;
if (cpi->twopass.section_intra_rating > 8)
@@ -128,7 +111,7 @@
int filt_best;
int filt_direction = 0;
- int Bias = 0; // Bias against raising loop filter and in favour of lowering it
+ int Bias = 0; // Bias against raising loop filter in favor of lowering it.
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -136,7 +119,8 @@
lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0
: cpi->oxcf.Sharpness;
- // Start the search at the previous frame filter level unless it is now out of range.
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
// Define the initial step size
@@ -153,9 +137,8 @@
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
while (filter_step > 0) {
- Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
- // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
if (cpi->twopass.section_intra_rating < 20)
Bias = Bias * cpi->twopass.section_intra_rating / 20;
@@ -163,8 +146,12 @@
if (cpi->common.tx_mode != ONLY_4X4)
Bias >>= 1;
- filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
- filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+ filt_high = ((filt_mid + filter_step) > max_filter_level)
+ ? max_filter_level
+ : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level)
+ ? min_filter_level
+ : (filt_mid - filter_step);
if ((filt_direction <= 0) && (filt_low != filt_mid)) {
// Get Low filter error score
@@ -176,7 +163,8 @@
// Re-instate the unfiltered frame
vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
- // If value is close to the best so far then bias towards a lower loop filter value.
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
if ((filt_err - Bias) < best_err) {
// Was it actually better than the previous best?
if (filt_err < best_err)
@@ -215,4 +203,3 @@
lf->filter_level = filt_best;
}
-
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index bbcad17..224d1e4 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -59,9 +59,8 @@
int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
double correction_factor) {
-
const double q = vp9_convert_qindex_to_q(qindex);
- int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;
+ int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
// q based adjustment to baseline enumerator
enumerator += (int)(enumerator * q) >> 12;
@@ -192,11 +191,12 @@
cpi->this_frame_target = cpi->per_frame_bandwidth;
}
- // Sanity check that the total sum of adjustments is not above the maximum allowed
- // That is that having allowed for KF and GF penalties we have not pushed the
- // current interframe target to low. If the adjustment we apply here is not capable of recovering
- // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
- // a longer time span via other buffer / rate control mechanisms.
+ // Check that the total sum of adjustments is not above the maximum allowed.
+ // That is, having allowed for the KF and GF penalties, we have not pushed
+ // the current inter-frame target too low. If the adjustment we apply here is
+ // not capable of recovering all the extra bits we have spent in the KF or GF,
+ // then the remainder will have to be recovered over a longer time span via
+ // other buffer / rate control mechanisms.
if (cpi->this_frame_target < min_frame_target)
cpi->this_frame_target = min_frame_target;
@@ -265,12 +265,12 @@
rate_correction_factor);
// Work out a size correction factor.
- // if ( cpi->this_frame_target > 0 )
- // correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
if (projected_size_based_on_q > 0)
- correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+ correction_factor =
+ (100 * cpi->projected_frame_size) / projected_size_based_on_q;
- // More heavily damped adjustment used if we have been oscillating either side of target
+ // More heavily damped adjustment used if we have been oscillating either side
+ // of target.
switch (damp_var) {
case 0:
adjustment_limit = 0.75;
@@ -287,27 +287,29 @@
// if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
if (correction_factor > 102) {
// We are not already at the worst allowable quality
- correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor > MAX_BPB_FACTOR)
rate_correction_factor = MAX_BPB_FACTOR;
- }
- // else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
- else if (correction_factor < 99) {
+ } else if (correction_factor < 99) {
// We are not already at the best allowable quality
- correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
- rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+ correction_factor =
+ (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor =
+ ((rate_correction_factor * correction_factor) / 100);
// Keep rate_correction_factor within limits
if (rate_correction_factor < MIN_BPB_FACTOR)
rate_correction_factor = MIN_BPB_FACTOR;
}
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
cpi->key_frame_rate_correction_factor = rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
cpi->gf_rate_correction_factor = rate_correction_factor;
else
@@ -326,20 +328,24 @@
double correction_factor;
// Select the appropriate correction factor based upon type of frame.
- if (cpi->common.frame_type == KEY_FRAME)
+ if (cpi->common.frame_type == KEY_FRAME) {
correction_factor = cpi->key_frame_rate_correction_factor;
- else {
+ } else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
correction_factor = cpi->gf_rate_correction_factor;
else
correction_factor = cpi->rate_correction_factor;
}
- // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
- target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS; // Case where we would overflow int
+ target_bits_per_mb =
+ (target_bits_per_frame / cpi->common.MBs)
+ << BPER_MB_NORMBITS; // Case where we would overflow int
else
- target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+ target_bits_per_mb =
+ (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
i = cpi->active_best_quality;
@@ -405,7 +411,6 @@
}
av_key_frame_frequency /= total_weight;
-
}
return av_key_frame_frequency;
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 26bbc82..ba521af 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -36,7 +36,7 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
-#include "vp9_rtcd.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_common.h"
@@ -45,9 +45,6 @@
/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
#define LAST_FRAME_MODE_MASK 0xFFEDCD60
#define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0
#define ALT_REF_MODE_MASK 0xFFC648D0
@@ -110,8 +107,13 @@
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
{2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
-#define MAX_RD_THRESH_FACT 64
-#define RD_THRESH_INC 1
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+#define RD_THRESH_POW 1.25
+#define RD_MULT_EPB_RATIO 64
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -161,7 +163,17 @@
static int compute_rd_mult(int qindex) {
const int q = vp9_dc_quant(qindex, 0);
- return (11 * q * q) >> 2;
+ // TODO(debargha): Adjust the function below
+ return (88 * q * q / 25);  // 3.52 * q^2 in integer math (was 2.75 * q^2).
+}
+
+static int compute_rd_thresh_factor(int qindex) {  // Threshold scale from the DC quantizer.
+ int q;
+ // TODO(debargha): Adjust the function below
+ q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);  // Truncated to int.
+ if (q < 8)  // Never return less than 8.
+ q = 8;
+ return q;
}
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
@@ -169,35 +181,9 @@
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}
-void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
+static void set_block_thresholds(VP9_COMP *cpi, int qindex) {
int q, i, bsize;
-
- vp9_clear_system_state(); // __asm emms;
-
- // Further tests required to see if optimum is different
- // for key frames, golden frames and arf frames.
- // if (cpi->common.refresh_golden_frame ||
- // cpi->common.refresh_alt_ref_frame)
- qindex = clamp(qindex, 0, MAXQ);
-
- cpi->RDDIV = 100;
- cpi->RDMULT = compute_rd_mult(qindex);
- if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
- if (cpi->twopass.next_iiratio > 31)
- cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
- else
- cpi->RDMULT +=
- (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
- }
- cpi->mb.errorperbit = cpi->RDMULT >> 6;
- cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
-
- vp9_set_speed_features(cpi);
-
- q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
- q <<= 2;
- if (q < 8)
- q = 8;
+ q = compute_rd_thresh_factor(qindex);
for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
for (i = 0; i < MAX_MODES; ++i) {
@@ -226,6 +212,34 @@
}
}
}
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
+ int i;
+
+ vp9_clear_system_state(); // __asm emms;
+
+ // Further tests required to see if optimum is different
+ // for key frames, golden frames and arf frames.
+ // if (cpi->common.refresh_golden_frame ||
+ // cpi->common.refresh_alt_ref_frame)
+ qindex = clamp(qindex, 0, MAXQ);
+
+ cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128)
+ cpi->RDMULT = compute_rd_mult(qindex);
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ if (cpi->twopass.next_iiratio > 31)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+ else
+ cpi->RDMULT +=
+ (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+ cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
+ cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
+
+ vp9_set_speed_features(cpi);
+
+ set_block_thresholds(cpi, qindex);
fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
@@ -249,10 +263,10 @@
MB_PREDICTION_MODE m;
for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
+ cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
cost_token(vp9_inter_mode_tree,
cpi->common.fc.inter_mode_probs[i],
- vp9_inter_mode_encodings + (m - NEARESTMV));
+ vp9_inter_mode_encodings + inter_mode_offset(m));
}
}
}
@@ -462,12 +476,12 @@
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
-static INLINE int cost_coeffs(MACROBLOCK *mb,
+static INLINE int cost_coeffs(MACROBLOCK *x,
int plane, int block,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
TX_SIZE tx_size,
const int16_t *scan, const int16_t *nb) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+ MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
@@ -476,9 +490,9 @@
const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
- mb->token_costs[tx_size][type][ref];
+ x->token_costs[tx_size][type][ref];
const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
- uint8_t token_cache[1024];
+ uint8_t *p_tok = x->token_cache;
int pt = combine_entropy_contexts(above_ec, left_ec);
int c, cost;
@@ -497,7 +511,7 @@
int v = qcoeff_ptr[0];
int prev_t = vp9_dct_value_tokens_ptr[v].token;
cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
- token_cache[0] = vp9_pt_energy_class[prev_t];
+ p_tok[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
@@ -507,9 +521,9 @@
v = qcoeff_ptr[rc];
t = vp9_dct_value_tokens_ptr[v].token;
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
- token_cache[rc] = vp9_pt_energy_class[t];
+ p_tok[rc] = vp9_pt_energy_class[t];
prev_t = t;
if (!--band_left) {
band_left = *band_count++;
@@ -519,7 +533,7 @@
// eob token
if (band_left) {
- pt = get_coef_context(nb, token_cache, c);
+ pt = get_coef_context(nb, p_tok, c);
cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
}
}
@@ -530,25 +544,6 @@
return cost;
}
-struct rdcost_block_args {
- MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
- TX_SIZE tx_size;
- int bw;
- int bh;
- int rate[256];
- int64_t dist[256];
- int64_t sse[256];
- int this_rate;
- int64_t this_dist;
- int64_t this_sse;
- int64_t this_rd;
- int64_t best_rd;
- int skip;
- const int16_t *scan, *nb;
-};
-
static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
const int ss_txfrm_size = tx_size << 1;
struct rdcost_block_args* args = arg;
@@ -560,17 +555,17 @@
int shift = args->tx_size == TX_32X32 ? 0 : 2;
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- args->dist[block] = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
- &this_sse) >> shift;
- args->sse[block] = this_sse >> shift;
+ args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+ &this_sse) >> shift;
+ args->sse = this_sse >> shift;
if (x->skip_encode &&
xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
- (1 << ss_txfrm_size)) >> shift;
- args->dist[block] = p;
- args->sse[block] = p;
+ (1 << ss_txfrm_size)) >> (shift + 2);
+ args->dist += (p >> 4);
+ args->sse += p;
}
}
@@ -581,10 +576,9 @@
int x_idx, y_idx;
txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
- args->rate[block] = cost_coeffs(args->x, plane, block,
- args->t_above + x_idx,
- args->t_left + y_idx, args->tx_size,
- args->scan, args->nb);
+ args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
+ args->t_left + y_idx, args->tx_size,
+ args->scan, args->nb);
}
static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -605,17 +599,17 @@
dist_block(plane, block, tx_size, args);
rate_block(plane, block, plane_bsize, tx_size, args);
- rd1 = RDCOST(x->rdmult, x->rddiv, args->rate[block], args->dist[block]);
- rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse[block]);
+ rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
if (plane == 0)
x->zcoeff_blk[tx_size][block] = rd1 > rd2;
- args->this_rate += args->rate[block];
- args->this_dist += args->dist[block];
- args->this_sse += args->sse[block];
+ args->this_rate += args->rate;
+ args->this_dist += args->dist;
+ args->this_sse += args->sse;
args->this_rd += rd;
if (args->this_rd > args->best_rd) {
@@ -657,7 +651,20 @@
}
}
+static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,  // Resets *arg for a new pass.
+ const int num_4x4_w, const int num_4x4_h,
+ const int64_t ref_rdcost,
+ struct rdcost_block_args *arg) {
+ vpx_memset(arg, 0, sizeof(struct rdcost_block_args));  // Every field not set below is 0.
+ arg->x = x;
+ arg->tx_size = tx_size;
+ arg->bw = num_4x4_w;
+ arg->bh = num_4x4_h;
+ arg->best_rd = ref_rdcost;  // Limit that block_yrd_txfm compares this_rd against.
+}
+
static void txfm_rd_in_plane(MACROBLOCK *x,
+ struct rdcost_block_args *rd_stack,
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
@@ -669,30 +676,29 @@
const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
const uint8_t *band_translate; // just for the get_scan_and_band call
- struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
- num_4x4_w, num_4x4_h,
- { 0 }, { 0 }, { 0 },
- 0, 0, 0, 0, ref_best_rd, 0 };
+ init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
+ ref_best_rd, rd_stack);
if (plane == 0)
xd->this_mi->mbmi.tx_size = tx_size;
- vp9_get_entropy_contexts(tx_size, args.t_above, args.t_left,
+ vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
pd->above_context, pd->left_context,
num_4x4_w, num_4x4_h);
- get_scan_and_band(xd, tx_size, pd->plane_type, 0, &args.scan, &args.nb,
- &band_translate);
+ get_scan_and_band(xd, tx_size, pd->plane_type, 0, &rd_stack->scan,
+ &rd_stack->nb, &band_translate);
- foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
- if (args.skip) {
+ foreach_transformed_block_in_plane(xd, bsize, plane,
+ block_yrd_txfm, rd_stack);
+ if (rd_stack->skip) {
*rate = INT_MAX;
*distortion = INT64_MAX;
*sse = INT64_MAX;
*skippable = 0;
} else {
- *distortion = args.this_dist;
- *rate = args.this_rate;
- *sse = args.this_sse;
+ *distortion = rd_stack->this_dist;
+ *rate = rd_stack->this_rate;
+ *sse = rd_stack->this_sse;
*skippable = vp9_is_skippable_in_plane(xd, bsize, plane);
}
}
@@ -720,7 +726,7 @@
} else {
mbmi->tx_size = TX_4X4;
}
- txfm_rd_in_plane(x, rate, distortion, skip,
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
&sse[mbmi->tx_size], ref_best_rd, 0, bs,
mbmi->tx_size);
cpi->tx_stepdown_count[0]++;
@@ -904,8 +910,8 @@
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
- txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
- ref_best_rd, 0, bs, mbmi->tx_size);
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
+ &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
if (max_tx_size == TX_32X32 &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
@@ -932,6 +938,7 @@
int64_t d[TX_SIZES], sse[TX_SIZES];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
+ struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
assert(bs == mbmi->sb_type);
if (mbmi->ref_frame[0] > INTRA_FRAME)
@@ -967,14 +974,16 @@
skip, sse, ref_best_rd, bs);
} else {
if (bs >= BLOCK_32X32)
- txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
- &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
+ &s[TX_32X32], &sse[TX_32X32],
+ ref_best_rd, 0, bs, TX_32X32);
if (bs >= BLOCK_16X16)
- txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
- &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
- txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
+ &s[TX_16X16], &sse[TX_16X16],
+ ref_best_rd, 0, bs, TX_16X16);
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
&sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
- txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+ txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
&sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
@@ -1044,7 +1053,7 @@
int64_t this_rd;
int ratey = 0;
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
continue;
// Only do the oblique modes if the best so far is
@@ -1100,11 +1109,11 @@
goto next;
if (tx_type != DCT_DCT)
- vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+ vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
dst, pd->dst.stride, tx_type);
else
- xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
- dst, pd->dst.stride);
+ xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
+ 16);
}
}
@@ -1236,7 +1245,7 @@
MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
MODE_INFO *left_mi = xd->mi_8x8[-1];
- if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
continue;
if (cpi->common.frame_type == KEY_FRAME) {
@@ -1284,7 +1293,7 @@
return best_rd;
}
-static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
+static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
@@ -1307,7 +1316,7 @@
*skippable = 1;
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+ txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
ref_best_rd, plane, bsize, uv_txfm_size);
if (pnrate == INT_MAX)
goto term;
@@ -1339,14 +1348,15 @@
// int mode_mask = (bsize <= BLOCK_8X8)
// ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
// if (!(mode_mask & (1 << mode)))
- if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
+ & (1 << mode)))
continue;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
- super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
+ super_block_uvrd(cpi, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd);
if (this_rate_tokenonly == INT_MAX)
continue;
@@ -1377,8 +1387,8 @@
int64_t this_sse;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
- super_block_uvrd(&cpi->common, x, rate_tokenonly,
- distortion, skippable, &this_sse, bsize, INT64_MAX);
+ super_block_uvrd(cpi, x, rate_tokenonly, distortion,
+ skippable, &this_sse, bsize, INT64_MAX);
*rate = *rate_tokenonly +
x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1416,7 +1426,7 @@
// Don't account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
- return x->inter_mode_cost[mode_context][mode - NEARESTMV];
+ return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
} else {
return 0;
}
@@ -1466,12 +1476,12 @@
case NEWMV:
this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
- mvjcost, mvcost, 102);
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
if (has_second_rf) {
this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
&second_best_ref_mv->as_mv,
- mvjcost, mvcost, 102);
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
}
break;
case NEARESTMV:
@@ -1502,7 +1512,8 @@
if (has_second_rf)
mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
- x->partition_info->bmi[i].mode = m;
+ mic->bmi[i].as_mode = m;
+
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
@@ -1649,7 +1660,7 @@
BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j, br = 0, idx, idy;
+ int i, br = 0, idx, idy;
int64_t bd = 0, block_sse = 0;
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
@@ -2011,15 +2022,6 @@
bsi->segment_rd = INT64_MAX;
return;
}
-
- for (j = 1; j < num_4x4_blocks_high; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j * 2],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
- for (j = 1; j < num_4x4_blocks_wide; ++j)
- vpx_memcpy(&x->partition_info->bmi[i + j],
- &x->partition_info->bmi[i],
- sizeof(x->partition_info->bmi[i]));
}
} /* for each label */
@@ -2031,7 +2033,7 @@
// update the coding decisions
for (i = 0; i < 4; ++i)
- bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->modes[i] = mi->bmi[i].as_mode;
}
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2076,7 +2078,7 @@
if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
- x->partition_info->bmi[i].mode = bsi->modes[i];
+ mi->bmi[i].as_mode = bsi->modes[i];
}
/*
@@ -2209,7 +2211,6 @@
static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
- PARTITION_INFO *partition,
int_mv *ref_mv,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
@@ -2223,9 +2224,6 @@
ctx->best_mode_index = mode_index;
ctx->mic = *xd->this_mi;
- if (partition)
- ctx->partition_info = *partition;
-
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
@@ -2457,7 +2455,7 @@
&dis, &sse);
}
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
- x->nmvjointcost, x->mvcost, 96);
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
x->pred_mv[ref].as_int = tmp_mv->as_int;
@@ -2618,10 +2616,10 @@
}
*rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&mbmi->ref_mvs[refs[0]][0].as_mv,
- x->nmvjointcost, x->mvcost, 96);
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
*rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
&mbmi->ref_mvs[refs[1]][0].as_mv,
- x->nmvjointcost, x->mvcost, 96);
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
vpx_free(second_pred);
}
@@ -2674,10 +2672,10 @@
} else {
rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&mbmi->ref_mvs[refs[0]][0].as_mv,
- x->nmvjointcost, x->mvcost, 96);
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
&mbmi->ref_mvs[refs[1]][0].as_mv,
- x->nmvjointcost, x->mvcost, 96);
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (frame_mv[refs[0]].as_int == INVALID_MV ||
frame_mv[refs[1]].as_int == INVALID_MV)
@@ -3015,7 +3013,7 @@
rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
- super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+ super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
bsize, ref_best_rd - rdcosty);
if (*rate_uv == INT_MAX) {
*rate2 = INT_MAX;
@@ -3645,10 +3643,17 @@
// values, which actually are bigger than this_rd itself. This can
// cause negative best_filter_rd[] values, which is obviously silly.
// Therefore, if filter_cache < ref, we do an adjusted calculation.
- if (cpi->rd_filter_cache[i] >= ref)
+ if (cpi->rd_filter_cache[i] >= ref) {
adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
- else // FIXME(rbultje) do this for comppred also
- adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
+ } else {
+ // FIXME(rbultje) do this for comppred also
+ //
+ // To prevent out-of-range computation in
+ //   adj_rd = cpi->rd_filter_cache[i] * this_rd / ref,
+ // cpi->rd_filter_cache[i] / ref is converted to a 256-based ratio.
+ int tmp = cpi->rd_filter_cache[i] * 256 / ref;
+ adj_rd = (this_rd * tmp) >> 8;
+ }
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
}
}
@@ -3734,9 +3739,9 @@
} else {
cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
}
}
}
@@ -3783,7 +3788,6 @@
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- NULL,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
@@ -3842,7 +3846,6 @@
cpi->common.y_dc_delta_q);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
- PARTITION_INFO best_partition;
int best_skip2 = 0;
unsigned char best_zcoeff_blk[256] = { 0 };
@@ -3910,6 +3913,32 @@
ref_frame = vp9_ref_order[mode_index].ref_frame;
second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
+ if (mode_index == 3) {
+ switch (vp9_ref_order[best_mode_index].ref_frame) {
+ case INTRA_FRAME:
+ cpi->mode_skip_mask = 0;
+ break;
+ case LAST_FRAME:
+ cpi->mode_skip_mask = 0x0010;
+ break;
+ case GOLDEN_FRAME:
+ cpi->mode_skip_mask = 0x0008;
+ break;
+ case ALTREF_FRAME:
+ cpi->mode_skip_mask = 0x0000;
+ break;
+ case NONE:
+ case MAX_REF_FRAMES:
+ assert(!"Invalid Reference frame");
+ }
+ }
+ if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
+ continue;
+ }
+
// Skip if the current reference frame has been masked off
if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
(cpi->ref_frame_mask & (1 << ref_frame)))
@@ -4058,7 +4087,6 @@
&mbmi->ref_mvs[second_ref_frame][0] : NULL;
b_mode_info tmp_best_bmodes[16];
MB_MODE_INFO tmp_best_mbmode;
- PARTITION_INFO tmp_best_partition;
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
int pred_exists = 0;
int uv_skippable;
@@ -4122,7 +4150,6 @@
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- tmp_best_partition = *x->partition_info;
for (i = 0; i < 4; i++)
tmp_best_bmodes[i] = xd->this_mi->bmi[i];
pred_exists = 1;
@@ -4174,7 +4201,6 @@
distortion = tmp_best_distortion;
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
- *x->partition_info = tmp_best_partition;
for (i = 0; i < 4; i++)
xd->this_mi->bmi[i] = tmp_best_bmodes[i];
}
@@ -4202,7 +4228,7 @@
// then dont bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
BLOCK_8X8);
- super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
+ super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu);
if (rate_uv == INT_MAX)
continue;
@@ -4302,7 +4328,6 @@
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
- best_partition = *x->partition_info;
vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(best_zcoeff_blk));
@@ -4445,9 +4470,9 @@
} else {
cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
}
}
}
@@ -4460,15 +4485,8 @@
for (i = 0; i < 4; i++)
xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
} else {
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[0].as_int =
- best_bmodes[i].as_mv[0].as_int;
-
- if (has_second_ref(mbmi))
- for (i = 0; i < 4; i++)
- xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int;
-
- *x->partition_info = best_partition;
+ for (i = 0; i < 4; ++i)
+ vpx_memcpy(&xd->this_mi->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
@@ -4511,7 +4529,6 @@
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
scale_factor);
store_coding_context(x, ctx, best_mode_index,
- &best_partition,
&mbmi->ref_mvs[mbmi->ref_frame[0]][0],
&mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
mbmi->ref_frame[1]][0],
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index c86ea27..aa4068d 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -12,8 +12,10 @@
#ifndef VP9_ENCODER_VP9_RDOPT_H_
#define VP9_ENCODER_VP9_RDOPT_H_
+#define RDDIV_BITS 7  // RD distortion shift in bits: D << 7 == D * 128.
+
#define RDCOST(RM, DM, R, D) \
- (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))
+ (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)(D) << (DM)))
#define QIDX_SKIP_THRESH 115
void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
diff --git a/vp9/encoder/vp9_ssim.c b/vp9/encoder/vp9_ssim.c
index c155516..a5f18e6 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -42,8 +42,8 @@
}
}
-const static int64_t cc1 = 26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634; // 64^2 * (.01 * 255)^2
+static const int64_t cc2 = 239708; // 64^2 * (.03 * 255)^2
static double similarity(unsigned long sum_s, unsigned long sum_r,
unsigned long sum_sq_s, unsigned long sum_sq_r,
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 667b801..eb864d9 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,8 @@
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct) {
+ unsigned int *ct) {
+ const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
upd);
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 7acdaf6..521c777 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -19,7 +19,7 @@
vp9_prob newp, vp9_prob oldp);
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- vp9_prob upd, unsigned int *ct);
+ unsigned int *ct);
int vp9_prob_diff_update_savings_search(const unsigned int *ct,
vp9_prob oldp, vp9_prob *bestp,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4e095f2..08745b0 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -29,9 +29,6 @@
extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
#endif /* ENTROPY_STATS */
-DECLARE_ALIGNED(16, extern const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 6e686d6..61031e0 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -67,12 +67,6 @@
unsigned int *sse,
const uint8_t *second_pred);
-typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
- int rp, unsigned long *sum_s,
- unsigned long *sum_r, unsigned long *sum_sq_s,
- unsigned long *sum_sq_r,
- unsigned long *sum_sxr);
-
typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 95ae266..11eec7f 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -27,24 +27,6 @@
__m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
return _mm_unpacklo_epi64(buf0, buf1);
}
-
-static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpacklo_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpacklo_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
-
-static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) {
- // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers
- __m128i sign_bit = _mm_and_si128(a, mask16);
- __m128i b = _mm_unpackhi_epi16(a, kZero);
- sign_bit = _mm_cmplt_epi16(sign_bit, kZero);
- sign_bit = _mm_unpackhi_epi16(kZero, sign_bit);
- return _mm_or_si128(sign_bit, b);
-}
#endif
void FDCT32x32_2D(int16_t *input,
@@ -1159,28 +1141,43 @@
} else {
__m128i lstep1[64], lstep2[64], lstep3[64];
__m128i u[32], v[32], sign[16];
- const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
// stage 3
{
// expanding to 32-bit length priori to addition operations
- lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
- lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
- lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
- lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
- lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
- lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
- lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
- lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
- lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
- lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
- lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
- lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
- lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
- lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
- lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
- lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+ lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
@@ -1231,42 +1228,75 @@
lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
}
{
- lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
- lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
- lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
- lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
- lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
- lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
- lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
- lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
- lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
- lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
- lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
- lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
- lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
- lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
- lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
- lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+ lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
- lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
- lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
- lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
- lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
- lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
- lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
- lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
- lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
- lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
- lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
- lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
- lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
- lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
- lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
- lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
- lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+ lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+
lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
@@ -1302,14 +1332,22 @@
// stage 4
{
// expanding to 32-bit length priori to addition operations
- lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
- lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
- lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
- lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero);
- lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero);
- lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero);
- lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero);
- lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero);
+ lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
@@ -1337,41 +1375,41 @@
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -2647,4 +2685,4 @@
}
}
}
-}
+} // NOLINT
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index d141560..a3d0114 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -8,12 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
const unsigned char *src_ptr,
@@ -45,7 +45,6 @@
vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
*sse = var;
return (var - (((unsigned int)avg * avg) >> 4));
-
}
unsigned int vp9_variance8x8_mmx(
@@ -61,7 +60,6 @@
*sse = var;
return (var - (((unsigned int)avg * avg) >> 6));
-
}
unsigned int vp9_mse16x16_mmx(
@@ -74,10 +72,14 @@
int sum0, sum1, sum2, sum3;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
*sse = var;
@@ -94,11 +96,14 @@
unsigned int sse0, sse1, sse2, sse3, var;
int sum0, sum1, sum2, sum3, avg;
-
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
+ ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
var = sse0 + sse1 + sse2 + sse3;
avg = sum0 + sum1 + sum2 + sum3;
@@ -115,14 +120,15 @@
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
+ &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
@@ -135,13 +141,14 @@
unsigned int sse0, sse1, var;
int sum0, sum1, avg;
- vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+ vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
+ &sum0);
+ vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
+ ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
var = sse0 + sse1;
avg = sum0 + sum1;
*sse = var;
return (var - (((unsigned int)avg * avg) >> 7));
-
}
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index cea934d..79e42c4 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
@@ -26,7 +26,7 @@
unsigned int vp9_get_mb_ss_sse2
(
- const short *src_ptr
+ const int16_t *src_ptr
);
unsigned int vp9_get16x16var_sse2
(
@@ -250,7 +250,6 @@
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
-
unsigned int sse0;
int sum0;
vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
@@ -407,12 +406,12 @@
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
@@ -487,12 +486,12 @@
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16, 8, 16, 4, 3, opt1,); \
-FN(8, 16, 8, 3, 4, opt1,); \
-FN(8, 8, 8, 3, 3, opt1,); \
-FN(8, 4, 8, 3, 2, opt1,); \
-FN(4, 8, 4, 2, 3, opt2,); \
-FN(4, 4, 4, 2, 2, opt2,)
+FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
+FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
+FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
+FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
+FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
+FN(4, 4, 4, 2, 2, opt2, (unsigned int))
FNS(sse2, sse);
FNS(ssse3, ssse3);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 10fa461..af6e665 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -68,6 +68,8 @@
VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h
+VP9_COMMON_SRCS-yes += common/vp9_scan.c
+VP9_COMMON_SRCS-yes += common/vp9_scan.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
@@ -75,6 +77,7 @@
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
@@ -89,6 +92,11 @@
# common (c)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_horiz_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_vert_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 157752a..810fdf5 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -8,30 +8,30 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+#include <string.h>
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_version.h"
+#include "./vpx_version.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vpx/vp8cx.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/common/vp9_onyx.h"
#include "vp9/vp9_iface_common.h"
-#include <stdlib.h>
-#include <string.h>
struct vp9_extracfg {
struct vpx_codec_pkt_list *pkt_list;
- int cpu_used; /** available cpu percentage in 1/16*/
- unsigned int enable_auto_alt_ref; /** if encoder decides to uses alternate reference frame */
+ int cpu_used; /* available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref;
unsigned int noise_sensitivity;
unsigned int Sharpness;
unsigned int static_thresh;
unsigned int tile_columns;
unsigned int tile_rows;
- unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
- unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
- unsigned int arnr_type; /* alt_ref filter type */
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int arnr_type;
unsigned int experimental;
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
@@ -48,7 +48,7 @@
static const struct extraconfig_map extracfg_map[] = {
{
0,
- {
+ { // NOLINT
NULL,
0, /* cpu_used */
1, /* enable_auto_alt_ref */
@@ -85,7 +85,7 @@
uint32_t pending_frame_magnitude;
vpx_image_t preview_img;
vp8_postproc_cfg_t preview_ppcfg;
- vpx_codec_pkt_list_decl(64) pkt_list; // changed to accomendate the maximum number of lagged frames allowed
+ vpx_codec_pkt_list_decl(64) pkt_list;
unsigned int fixed_kf_cntr;
};
@@ -120,26 +120,26 @@
#define ERROR(str) do {\
ctx->base.err_detail = str;\
return VPX_CODEC_INVALID_PARAM;\
- } while(0)
+ } while (0)
-#define RANGE_CHECK(p,memb,lo,hi) do {\
- if(!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
+#define RANGE_CHECK(p, memb, lo, hi) do {\
+ if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
ERROR(#memb " out of range ["#lo".."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_HI(p,memb,hi) do {\
- if(!((p)->memb <= (hi))) \
+#define RANGE_CHECK_HI(p, memb, hi) do {\
+ if (!((p)->memb <= (hi))) \
ERROR(#memb " out of range [.."#hi"]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_LO(p,memb,lo) do {\
- if(!((p)->memb >= (lo))) \
+#define RANGE_CHECK_LO(p, memb, lo) do {\
+ if (!((p)->memb >= (lo))) \
ERROR(#memb " out of range ["#lo"..]");\
- } while(0)
+ } while (0)
-#define RANGE_CHECK_BOOL(p,memb) do {\
- if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
- } while(0)
+#define RANGE_CHECK_BOOL(p, memb) do {\
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while (0)
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
@@ -247,7 +247,8 @@
oxcf->width = cfg.g_w;
oxcf->height = cfg.g_h;
/* guess a frame rate if out of whack, use 30 */
- oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+ oxcf->framerate = (double)(cfg.g_timebase.den)
+ / (double)(cfg.g_timebase.num);
if (oxcf->framerate > 180) {
oxcf->framerate = 30;
@@ -266,11 +267,11 @@
}
if (cfg.g_pass == VPX_RC_FIRST_PASS) {
- oxcf->allow_lag = 0;
- oxcf->lag_in_frames = 0;
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
} else {
- oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
- oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
}
// VBR only supported for now.
@@ -282,7 +283,7 @@
else if (cfg.rc_end_usage == VPX_Q)
oxcf->end_usage = USAGE_CONSTANT_QUALITY;
- oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
oxcf->best_allowed_q = cfg.rc_min_quantizer;
@@ -297,7 +298,7 @@
oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
- oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
@@ -313,23 +314,23 @@
oxcf->encode_breakout = vp8_cfg.static_thresh;
oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
- oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
- oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
- oxcf->output_pkt_list = vp8_cfg.pkt_list;
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
- oxcf->arnr_strength = vp8_cfg.arnr_strength;
- oxcf->arnr_type = vp8_cfg.arnr_type;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
oxcf->tuning = vp8_cfg.tuning;
oxcf->tile_columns = vp8_cfg.tile_columns;
- oxcf->tile_rows = vp8_cfg.tile_rows;
+ oxcf->tile_rows = vp8_cfg.tile_rows;
oxcf->lossless = vp8_cfg.lossless;
- oxcf->error_resilient_mode = cfg.g_error_resilient;
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
oxcf->ss_number_layers = cfg.ss_number_layers;
@@ -498,7 +499,7 @@
*/
for (i = 0;
extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
+ i++) {}
priv->vp8_cfg = extracfg_map[i].cfg;
priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
@@ -553,7 +554,6 @@
static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
-
free(ctx->cx_data);
vp9_remove_compressor(&ctx->cpi);
free(ctx);
@@ -712,8 +712,10 @@
lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
/* vp8 use 10,000,000 ticks/second as time stamp */
- dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
- dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num
+ / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num /
+ ctx->cfg.g_timebase.den;
if (img != NULL) {
res = image2yuvconfig(img, &sd);
@@ -839,8 +841,6 @@
cx_data += size;
cx_data_sz -= size;
}
-
- // printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
}
}
}
@@ -867,15 +867,14 @@
vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
&sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
-
+ }
}
static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
if (data) {
@@ -886,8 +885,9 @@
vp9_copy_reference_enc(ctx->cpi,
ref_frame_to_vp9_reframe(frame->frame_type), &sd);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,
@@ -916,8 +916,9 @@
if (data) {
ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
#else
(void)ctx;
(void)ctr_id;
@@ -928,7 +929,6 @@
static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) {
-
YV12_BUFFER_CONFIG sd;
vp9_ppflags_t flags = {0};
@@ -941,8 +941,9 @@
if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) {
yuvconfig2image(&ctx->preview_img, &sd, NULL);
return &ctx->preview_img;
- } else
+ } else {
return NULL;
+ }
}
static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx,
@@ -951,7 +952,6 @@
int update = va_arg(args, int);
vp9_update_entropy(ctx->cpi, update);
return VPX_CODEC_OK;
-
}
static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx,
@@ -983,8 +983,9 @@
return VPX_CODEC_OK;
else
return VPX_CODEC_INVALID_PARAM;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
@@ -994,21 +995,20 @@
vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
if (data) {
-
vpx_active_map_t *map = (vpx_active_map_t *)data;
if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
return VPX_CODEC_OK;
else
return VPX_CODEC_INVALID_PARAM;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
-
vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
if (data) {
@@ -1019,10 +1019,12 @@
if (!res) {
return VPX_CODEC_OK;
- } else
+ } else {
return VPX_CODEC_INVALID_PARAM;
- } else
+ }
+ } else {
return VPX_CODEC_INVALID_PARAM;
+ }
}
static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
@@ -1128,7 +1130,7 @@
static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = {
{
0,
- {
+ { // NOLINT
0, /* g_usage */
0, /* g_threads */
0, /* g_profile */
@@ -1197,13 +1199,13 @@
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
@@ -1226,13 +1228,13 @@
vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
- {
+ { // NOLINT
NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
- {
+ { // NOLINT
vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp9e_encode, /* vpx_codec_encode_fn_t encode; */
vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
diff --git a/vpxenc.c b/vpxenc.c
index 71cf01f..d7c6c0e 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -45,8 +45,8 @@
#include "vpx_ports/vpx_timer.h"
#include "tools_common.h"
#include "y4minput.h"
-#include "libmkv/EbmlWriter.h"
-#include "libmkv/EbmlIDs.h"
+#include "third_party/libmkv/EbmlWriter.h"
+#include "third_party/libmkv/EbmlIDs.h"
#include "third_party/libyuv/include/libyuv/scale.h"
/* Need special handling of these functions on Windows */