Merge "Fix intermediate height in convolve_c"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 89453bc..0743f35 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -75,7 +75,7 @@
static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561;
-static void butterfly_16x16_dct_1d(double input[16], double output[16]) {
+void butterfly_16x16_dct_1d(double input[16], double output[16]) {
double step[16];
double intermediate[16];
double temp1, temp2;
@@ -287,37 +287,37 @@
vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type);
}
-class FwdTrans16x16Test : public ::testing::TestWithParam<int> {
+class Trans16x16Test : public ::testing::TestWithParam<int> {
public:
- virtual ~FwdTrans16x16Test() {}
+ virtual ~Trans16x16Test() {}
virtual void SetUp() {
tx_type_ = GetParam();
if (tx_type_ == 0) {
- fwd_txfm = fdct16x16;
- inv_txfm = idct16x16_add;
+ fwd_txfm_ = fdct16x16;
+ inv_txfm_ = idct16x16_add;
} else {
- fwd_txfm = fht16x16;
- inv_txfm = iht16x16_add;
+ fwd_txfm_ = fht16x16;
+ inv_txfm_ = iht16x16_add;
}
}
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
- (*fwd_txfm)(in, out, dst, stride, tx_type);
+ (*fwd_txfm_)(in, out, dst, stride, tx_type);
}
void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
int stride, int tx_type) {
- (*inv_txfm)(in, out, dst, stride, tx_type);
+ (*inv_txfm_)(in, out, dst, stride, tx_type);
}
int tx_type_;
- void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
- void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int);
+ void (*fwd_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
+ void (*inv_txfm_)(int16_t*, int16_t*, uint8_t*, int, int);
};
-TEST_P(FwdTrans16x16Test, AccuracyCheck) {
+TEST_P(Trans16x16Test, AccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
int max_error = 0;
int total_error = 0;
@@ -355,7 +355,7 @@
<< "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
}
-TEST_P(FwdTrans16x16Test, CoeffSizeCheck) {
+TEST_P(Trans16x16Test, CoeffSizeCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
for (int i = 0; i < count_test_block; ++i) {
@@ -389,14 +389,19 @@
}
}
-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4));
-
-TEST(VP9Idct16x16Test, AccuracyCheck) {
+TEST_P(Trans16x16Test, InvAccuracyCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
+ // TODO(jingning): is this unit test necessary? If so, we need to add
+ // check sets for inverse hybrid transforms too.
+ if (tx_type_ != DCT_DCT)
+ return;
+
for (int i = 0; i < count_test_block; ++i) {
- int16_t in[256], coeff[256];
- uint8_t dst[256], src[256];
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, 256);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, 256);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256);
double out_r[256];
for (int j = 0; j < 256; ++j) {
@@ -410,7 +415,10 @@
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < 256; j++)
coeff[j] = round(out_r[j]);
- vp9_short_idct16x16_add_c(coeff, dst, 16);
+
+ const int pitch = 32;
+ RunInvTxfm(coeff, coeff, dst, pitch, tx_type_);
+
for (int j = 0; j < 256; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
@@ -421,4 +429,5 @@
}
}
+INSTANTIATE_TEST_CASE_P(VP9, Trans16x16Test, ::testing::Range(0, 4));
} // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 0ac4905..579d7e2 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -520,3 +520,5 @@
83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5
fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm
94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5
+495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm
+65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5
diff --git a/test/test.mk b/test/test.mk
index 25e05b9..2042c86 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -629,3 +629,5 @@
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 9b0e9d5..4cd356d 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,10 @@
"vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
"vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
"vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
- "vp90-2-03-size-226x226.webm"
+ "vp90-2-03-size-226x226.webm",
+#if CONFIG_NON420
+ "vp91-2-04-yv444.webm"
+#endif
};
#endif
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index a66a450..332a839 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -39,8 +39,8 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
// FIXME(rbultje) split in its own file
- for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
- bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) {
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+ bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
const int block_width = 4 << b_width_log2(bsize);
const int block_height = 4 << b_height_log2(bsize);
int16_t *diff = reinterpret_cast<int16_t *>(
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
new file mode 100644
index 0000000..cf5c8f7
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
@@ -0,0 +1,198 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_short_idct16x16_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_short_idct16x16_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asr r0, r0, #6 ; >> 6
+
+ vdup.s16 q0, r0 ; duplicate a1
+ mov r0, #8
+ sub r2, #8
+
+ ; load destination data row0 - row3
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row4 - row7
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row8 - row11
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ ; load destination data row12 - row15
+ vld1.64 {d2}, [r1], r0
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r0
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r0
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r0
+ vld1.64 {d17}, [r1], r2
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r0
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r0
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |vp9_short_idct16x16_1_add_neon|
+
+ END
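
For reference, a minimal C sketch of the DC-only path the routine above implements, assuming the usual libvpx definitions (DCT_CONST_BITS == 14, cospi_16_64 == 11585); the helper names are illustrative, not part of this patch. The 8x8 routine in the next file follows the same scheme, with ROUND_POWER_OF_TWO(out, 5) and an 8x8 walk.

    #include <stdint.h>

    #define DCT_CONST_BITS 14

    static const int cospi_16_64 = 11585;

    static int dct_const_round_shift(int input) {
      return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
    }

    static uint8_t clip_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // Only input[0] (the DC coefficient) matters, so every pixel of the
    // destination block receives the same offset a1.
    static void idct16x16_1_add_sketch(const int16_t *input, uint8_t *dest,
                                       int dest_stride) {
      int out = dct_const_round_shift(input[0] * cospi_16_64);
      int a1, r, c;
      out = dct_const_round_shift(out * cospi_16_64);
      a1 = (out + 32) >> 6;  // ROUND_POWER_OF_TWO(out, 6)
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c)
          dest[c] = clip_u8(dest[c] + a1);
        dest += dest_stride;
      }
    }
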
diff --git a/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
new file mode 100644
index 0000000..923804f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_short_idct8x8_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_short_idct8x8_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 5)
+ add r0, r0, #16 ; + (1 <<((5) - 1))
+ asr r0, r0, #5 ; >> 5
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ ; load destination data
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r2
+ vld1.64 {d17}, [r1]
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |vp9_short_idct8x8_1_add_neon|
+
+ END
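
The vaddw.u8 / vqmovun.s16 pairing used throughout both routines widens eight pixels to 16 bits, adds the (possibly negative) DC offset, and saturates back to the 8-bit range. A scalar C equivalent of one such pair, as a sketch with an illustrative name:

    #include <stdint.h>

    // One vaddw.u8 + vqmovun.s16 pair: add a signed 16-bit offset to eight
    // pixels and clamp the results to [0, 255].
    static void add_offset_saturate_8(uint8_t *dst, int16_t a1) {
      int i;
      for (i = 0; i < 8; ++i) {
        const int v = dst[i] + a1;  // vaddw.u8: u8 + s16 -> s16
        dst[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vqmovun.s16
      }
    }
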
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index 79092cd..f1447211 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -13,6 +13,7 @@
#include "vp9_rtcd.h"
#include "vp9/common/vp9_onyxc_int.h"
-void vp9_machine_specific_config(VP9_COMMON *ctx) {
+void vp9_machine_specific_config(VP9_COMMON *cm) {
+ (void)cm;
vp9_rtcd();
}
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index b68a052..e89fea8 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -47,26 +47,26 @@
}
}
-void vp9_free_frame_buffers(VP9_COMMON *oci) {
+void vp9_free_frame_buffers(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
- vp9_free_frame_buffer(&oci->yv12_fb[i]);
+ vp9_free_frame_buffer(&cm->yv12_fb[i]);
- vp9_free_frame_buffer(&oci->post_proc_buffer);
+ vp9_free_frame_buffer(&cm->post_proc_buffer);
- vpx_free(oci->mip);
- vpx_free(oci->prev_mip);
- vpx_free(oci->above_seg_context);
- vpx_free(oci->last_frame_seg_map);
+ vpx_free(cm->mip);
+ vpx_free(cm->prev_mip);
+ vpx_free(cm->above_seg_context);
+ vpx_free(cm->last_frame_seg_map);
- vpx_free(oci->above_context[0]);
+ vpx_free(cm->above_context[0]);
for (i = 0; i < MAX_MB_PLANE; i++)
- oci->above_context[i] = 0;
- oci->mip = NULL;
- oci->prev_mip = NULL;
- oci->above_seg_context = NULL;
- oci->last_frame_seg_map = NULL;
+ cm->above_context[i] = 0;
+ cm->mip = NULL;
+ cm->prev_mip = NULL;
+ cm->above_seg_context = NULL;
+ cm->last_frame_seg_map = NULL;
}
static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) {
@@ -93,95 +93,95 @@
vp9_update_mode_info_in_image(cm, cm->prev_mi);
}
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
- const int ss_x = oci->subsampling_x;
- const int ss_y = oci->subsampling_y;
+ const int ss_x = cm->subsampling_x;
+ const int ss_y = cm->subsampling_y;
int mi_size;
- vp9_free_frame_buffers(oci);
+ vp9_free_frame_buffers(cm);
for (i = 0; i < NUM_YV12_BUFFERS; i++) {
- oci->fb_idx_ref_cnt[i] = 0;
- if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y,
+ cm->fb_idx_ref_cnt[i] = 0;
+ if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
}
- oci->new_fb_idx = NUM_YV12_BUFFERS - 1;
- oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;
+ cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+ cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
- oci->active_ref_idx[i] = i;
+ cm->active_ref_idx[i] = i;
for (i = 0; i < NUM_REF_FRAMES; i++) {
- oci->ref_frame_map[i] = i;
- oci->fb_idx_ref_cnt[i] = 1;
+ cm->ref_frame_map[i] = i;
+ cm->fb_idx_ref_cnt[i] = 1;
}
- if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y,
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
- set_mb_mi(oci, aligned_width, aligned_height);
+ set_mb_mi(cm, aligned_width, aligned_height);
// Allocation
- mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE);
+ mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
- oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
- if (!oci->mip)
+ cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->mip)
goto fail;
- oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
- if (!oci->prev_mip)
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->prev_mip)
goto fail;
- setup_mi(oci);
+ setup_mi(cm);
// FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
// information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(oci->mi_cols);
+ mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
// block where mi unit size is 8x8.
- oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
+ cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
(2 * mi_cols), 1);
- if (!oci->above_context[0])
+ if (!cm->above_context[0])
goto fail;
- oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
- if (!oci->above_seg_context)
+ cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
+ if (!cm->above_seg_context)
goto fail;
// Create the segmentation map structure and set to 0.
- oci->last_frame_seg_map = vpx_calloc(oci->mi_rows * oci->mi_cols, 1);
- if (!oci->last_frame_seg_map)
+ cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+ if (!cm->last_frame_seg_map)
goto fail;
return 0;
fail:
- vp9_free_frame_buffers(oci);
+ vp9_free_frame_buffers(cm);
return 1;
}
-void vp9_create_common(VP9_COMMON *oci) {
- vp9_machine_specific_config(oci);
+void vp9_create_common(VP9_COMMON *cm) {
+ vp9_machine_specific_config(cm);
- vp9_init_mbmode_probs(oci);
+ vp9_init_mbmode_probs(cm);
- oci->tx_mode = ONLY_4X4;
- oci->comp_pred_mode = HYBRID_PREDICTION;
+ cm->tx_mode = ONLY_4X4;
+ cm->comp_pred_mode = HYBRID_PREDICTION;
// Initialize reference frame sign bias structure to defaults
- vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+ vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
}
-void vp9_remove_common(VP9_COMMON *oci) {
- vp9_free_frame_buffers(oci);
+void vp9_remove_common(VP9_COMMON *cm) {
+ vp9_free_frame_buffers(cm);
}
void vp9_initialize_common() {
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index 8bf5ed1..b7d7eba 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -16,14 +16,14 @@
void vp9_initialize_common();
-void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi);
-void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
+void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
+void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi);
-void vp9_create_common(VP9_COMMON *oci);
-void vp9_remove_common(VP9_COMMON *oci);
+void vp9_create_common(VP9_COMMON *cm);
+void vp9_remove_common(VP9_COMMON *cm);
-int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height);
-void vp9_free_frame_buffers(VP9_COMMON *oci);
+int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
+void vp9_free_frame_buffers(VP9_COMMON *cm);
void vp9_update_frame_size(VP9_COMMON *cm);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6e57259..5ba7846 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -71,7 +71,7 @@
D135_PRED, // Directional 135 deg = 180 - 45
D117_PRED, // Directional 117 deg = 180 - 63
D153_PRED, // Directional 153 deg = 180 - 27
- D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi)
+ D207_PRED, // Directional 207 deg = 180 + 27
D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
TM_PRED, // True-motion
NEARESTMV,
@@ -115,18 +115,18 @@
MAX_REF_FRAMES = 4
} MV_REFERENCE_FRAME;
-static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_width_log2(BLOCK_SIZE sb_type) {
return b_width_log2_lookup[sb_type];
}
-static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int b_height_log2(BLOCK_SIZE sb_type) {
return b_height_log2_lookup[sb_type];
}
-static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_width_log2(BLOCK_SIZE sb_type) {
return mi_width_log2_lookup[sb_type];
}
-static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
+static INLINE int mi_height_log2(BLOCK_SIZE sb_type) {
return mi_height_log2_lookup[sb_type];
}
@@ -134,7 +134,7 @@
typedef struct {
MB_PREDICTION_MODE mode, uv_mode;
MV_REFERENCE_FRAME ref_frame[2];
- TX_SIZE txfm_size;
+ TX_SIZE tx_size;
int_mv mv[2]; // for each reference frame used
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
int_mv best_mv, best_second_mv;
@@ -153,7 +153,7 @@
INTERPOLATIONFILTERTYPE interp_filter;
- BLOCK_SIZE_TYPE sb_type;
+ BLOCK_SIZE sb_type;
} MB_MODE_INFO;
typedef struct {
@@ -245,7 +245,7 @@
} MACROBLOCKD;
-static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
switch (subsize) {
case BLOCK_64X64:
case BLOCK_64X32:
@@ -270,9 +270,8 @@
}
}
-static INLINE void update_partition_context(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE sb_type,
- BLOCK_SIZE_TYPE sb_size) {
+static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
+ BLOCK_SIZE sb_size) {
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
const int bwl = b_width_log2(sb_type);
const int bhl = b_height_log2(sb_type);
@@ -290,8 +289,7 @@
vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs);
}
-static INLINE int partition_plane_context(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE sb_type) {
+static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) {
int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
int above = 0, left = 0, i;
int boffset = mi_width_log2(BLOCK_64X64) - bsl;
@@ -311,9 +309,8 @@
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
-static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
- PARTITION_TYPE partition) {
- const BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize];
+static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+ const BLOCK_SIZE subsize = subsize_lookup[partition][bsize];
assert(subsize < BLOCK_SIZES);
return subsize;
}
@@ -363,34 +360,33 @@
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
- return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]);
+ return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
}
-static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize,
- const struct macroblockd_plane *pd) {
- BLOCK_SIZE_TYPE bs = ss_size_lookup[bsize]
- [pd->subsampling_x][pd->subsampling_y];
+static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd) {
+ BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
assert(bs < BLOCK_SIZES);
return bs;
}
-static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_width(BLOCK_SIZE bsize,
const struct macroblockd_plane* plane) {
return 4 << (b_width_log2(bsize) - plane->subsampling_x);
}
-static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize,
+static INLINE int plane_block_height(BLOCK_SIZE bsize,
const struct macroblockd_plane* plane) {
return 4 << (b_height_log2(bsize) - plane->subsampling_y);
}
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
- BLOCK_SIZE_TYPE plane_bsize,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size,
void *arg);
static INLINE void foreach_transformed_block_in_plane(
- const MACROBLOCKD *const xd, BLOCK_SIZE_TYPE bsize, int plane,
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi;
@@ -398,8 +394,8 @@
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi)
- : mbmi->txfm_size;
- const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+ : mbmi->tx_size;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int step = 1 << (tx_size << 1);
@@ -440,7 +436,7 @@
}
static INLINE void foreach_transformed_block(
- const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+ const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg) {
int plane;
@@ -449,7 +445,7 @@
}
static INLINE void foreach_transformed_block_uv(
- const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
+ const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg) {
int plane;
@@ -457,25 +453,25 @@
foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
-static int raster_block_offset(BLOCK_SIZE_TYPE plane_bsize,
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
int raster_block, int stride) {
const int bw = b_width_log2(plane_bsize);
const int y = 4 * (raster_block >> bw);
const int x = 4 * (raster_block & ((1 << bw) - 1));
return y * stride + x;
}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE_TYPE plane_bsize,
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
int raster_block, int16_t *base) {
const int stride = 4 << b_width_log2(plane_bsize);
return base + raster_block_offset(plane_bsize, raster_block, stride);
}
-static uint8_t* raster_block_offset_uint8(BLOCK_SIZE_TYPE plane_bsize,
+static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
int raster_block, uint8_t *base,
int stride) {
return base + raster_block_offset(plane_bsize, raster_block, stride);
}
-static int txfrm_block_to_raster_block(BLOCK_SIZE_TYPE plane_bsize,
+static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block) {
const int bwl = b_width_log2(plane_bsize);
const int tx_cols_log2 = bwl - tx_size;
@@ -486,7 +482,7 @@
return x + (y << bwl);
}
-static void txfrm_block_to_raster_xy(BLOCK_SIZE_TYPE plane_bsize,
+static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block,
int *x, int *y) {
const int bwl = b_width_log2(plane_bsize);
@@ -497,7 +493,7 @@
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE_TYPE plane_bsize,
+static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
@@ -536,7 +532,7 @@
}
static void set_contexts_on_border(MACROBLOCKD *xd,
struct macroblockd_plane *pd,
- BLOCK_SIZE_TYPE plane_bsize,
+ BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
int aoff, int loff,
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
@@ -573,7 +569,7 @@
}
static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
- BLOCK_SIZE_TYPE plane_bsize, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
ENTROPY_CONTEXT *const L = pd->left_context + loff;
@@ -588,4 +584,10 @@
}
}
+static int get_tx_eob(struct segmentation *seg, int segment_id,
+ TX_SIZE tx_size) {
+ const int eob_max = 16 << (tx_size << 1);
+ return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
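+// Example (illustrative): TX_16X16 == 2, so eob_max == 16 << 4 == 256;
+// with SEG_LVL_SKIP active for the segment, the block is coded with an
+// eob of 0.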
+
#endif // VP9_COMMON_VP9_BLOCKD_H_
diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index 310a667..dc41efd 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c
@@ -74,7 +74,7 @@
}
};
-const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
+const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
{ // PARTITION_NONE
BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
@@ -115,7 +115,7 @@
TX_16X16, TX_16X16, TX_16X16, TX_32X32
};
-const BLOCK_SIZE_TYPE ss_size_lookup[BLOCK_SIZES][2][2] = {
+const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
{{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}},
diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h
index 808b9ed..3822bfc 100644
--- a/vp9/common/vp9_common_data.h
+++ b/vp9/common/vp9_common_data.h
@@ -24,9 +24,9 @@
extern const int size_group_lookup[BLOCK_SIZES];
extern const int num_pels_log2_lookup[BLOCK_SIZES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
-extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
+extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
-extern const BLOCK_SIZE_TYPE ss_size_lookup[BLOCK_SIZES][2][2];
+extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
#endif // VP9_COMMON_VP9_COMMON_DATA_H
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index f2def5c..4de50aa 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -22,17 +22,17 @@
* and uses the passed in member offset to print out the value of an integer
* for each mbmi member value in the mi structure.
*/
-static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor,
size_t member_offset) {
int mi_row;
int mi_col;
int mi_index = 0;
- MODE_INFO *mi = common->mi;
- int rows = common->mi_rows;
- int cols = common->mi_cols;
+ MODE_INFO *mi = cm->mi;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
char prefix = descriptor[0];
- log_frame_info(common, descriptor, file);
+ log_frame_info(cm, descriptor, file);
mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(file, "%c ", prefix);
@@ -59,7 +59,7 @@
print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff));
print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
- print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+ print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
log_frame_info(cm, "Vectors ", mvs);
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 21e0e04..32d9e0c 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -436,11 +436,11 @@
#include "vp9/common/vp9_default_coef_probs.h"
-void vp9_default_coef_probs(VP9_COMMON *pc) {
- vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
- vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
- vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
- vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+void vp9_default_coef_probs(VP9_COMMON *cm) {
+ vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
+ vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
+ vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
+ vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
// Neighborhood 5-tuples for various scans and blocksizes,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 82cbfd3..699b44a 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -95,7 +95,7 @@
#define MODULUS_PARAM 13 /* Modulus parameter */
struct VP9Common;
-void vp9_default_coef_probs(struct VP9Common *);
+void vp9_default_coef_probs(struct VP9Common *cm);
extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
@@ -154,13 +154,13 @@
vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
void vp9_coef_tree_initialize(void);
-void vp9_adapt_coef_probs(struct VP9Common *);
+void vp9_adapt_coef_probs(struct VP9Common *cm);
-static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
- const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
num_4x4_blocks_wide_lookup[plane_bsize]);
vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
@@ -336,6 +336,45 @@
}
}
+static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx,
+ ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+ const int16_t **scan,
+ const uint8_t **band_translate) {
+ ENTROPY_CONTEXT above_ec, left_ec;
+
+ switch (tx_size) {
+ case TX_4X4:
+ *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
+ *band_translate = vp9_coefband_trans_4x4;
+ above_ec = A[0] != 0;
+ left_ec = L[0] != 0;
+ break;
+ case TX_8X8:
+ *scan = get_scan_8x8(get_tx_type_8x8(type, xd));
+ *band_translate = vp9_coefband_trans_8x8plus;
+ above_ec = !!*(uint16_t *)A;
+ left_ec = !!*(uint16_t *)L;
+ break;
+ case TX_16X16:
+ *scan = get_scan_16x16(get_tx_type_16x16(type, xd));
+ *band_translate = vp9_coefband_trans_8x8plus;
+ above_ec = !!*(uint32_t *)A;
+ left_ec = !!*(uint32_t *)L;
+ break;
+ case TX_32X32:
+ *scan = vp9_default_scan_32x32;
+ *band_translate = vp9_coefband_trans_8x8plus;
+ above_ec = !!*(uint64_t *)A;
+ left_ec = !!*(uint64_t *)L;
+ break;
+ default:
+ assert(!"Invalid transform size.");
+ }
+
+ return combine_entropy_contexts(above_ec, left_ec);
+}
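+
+// Usage sketch (illustrative local names): callers combine the above/left
+// contexts before decoding a block's tokens, e.g.
+//   const int16_t *scan;
+//   const uint8_t *band;
+//   const int pt = get_entropy_context(xd, tx_size, type, block_idx,
+//                                      A, L, &scan, &band);
+// where A and L point into the plane's above/left ENTROPY_CONTEXT arrays.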
+
enum { VP9_COEF_UPDATE_PROB = 252 };
#endif // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3bc8de1..a75d1a9 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -235,7 +235,7 @@
-D135_PRED, -D117_PRED, /* 5 = D135_NODE */
-D45_PRED, 14, /* 6 = D45_NODE */
-D63_PRED, 16, /* 7 = D63_NODE */
- -D153_PRED, -D27_PRED /* 8 = D153_NODE */
+ -D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
const vp9_tree_index vp9_inter_mode_tree[6] = {
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 2f8085d..4cf4c03 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -58,9 +58,9 @@
void vp9_setup_past_independence(struct VP9Common *cm);
-void vp9_init_mbmode_probs(struct VP9Common *x);
+void vp9_init_mbmode_probs(struct VP9Common *cm);
-void vp9_adapt_mode_probs(struct VP9Common *);
+void vp9_adapt_mode_probs(struct VP9Common *cm);
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9fab56f..1bf0742 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -22,7 +22,7 @@
#define MI_MASK (MI_BLOCK_SIZE - 1)
-typedef enum BLOCK_SIZE_TYPE {
+typedef enum BLOCK_SIZE {
BLOCK_4X4,
BLOCK_4X8,
BLOCK_8X4,
@@ -38,7 +38,7 @@
BLOCK_64X64,
BLOCK_SIZES,
BLOCK_INVALID = BLOCK_SIZES
-} BLOCK_SIZE_TYPE;
+} BLOCK_SIZE;
typedef enum PARTITION_TYPE {
PARTITION_NONE,
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index 178ad87..72572df 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -36,7 +36,7 @@
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
-void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index fc52747..cfa61c2 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -22,13 +22,217 @@
const uint8_t *hev_thr;
};
+// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block. Above_ entries refer to whether or not to apply a
+// filter on the above border. Int_ entries refer to whether or not to
+// apply filters on the 4x4 edges within the 8x8 block that each bit
+// represents.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+ uint64_t left_y[TX_SIZES];
+ uint64_t above_y[TX_SIZES];
+ uint64_t int_4x4_y;
+ uint16_t left_uv[TX_SIZES];
+ uint16_t above_uv[TX_SIZES];
+ uint16_t int_4x4_uv;
+} LOOP_FILTER_MASK;
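+
+// For illustration: each bit of the y masks indexes one 8x8 block of the
+// 64x64 region in raster order, so the bit for the block at (row, col),
+// both in units of 8 pixels, is
+//   shift_y = (row << 3) + col;   // e.g. lfm->left_y[tx] |= 1 << shift_y;
+// The uv masks cover a 4x4 grid of 8x8 chroma blocks (4:2:0 sampling), so
+//   shift_uv = (row << 2) + col;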
+
+// 64 bit masks for left transform size. Each 1 represents a position where
+// we should apply a loop filter across the left border of an 8x8 block
+// boundary.
+//
+// In the case of TX_16X16 (low-order byte first) we end up with a mask
+// that looks like this:
+//
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+// 10101010
+//
+// A loopfilter should be applied to every other 8x8 horizontally.
+static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
+ 0xffffffffffffffff, // TX_4X4
+ 0xffffffffffffffff, // TX_8x8
+ 0x5555555555555555, // TX_16x16
+ 0x1111111111111111, // TX_32x32
+};
+
+// 64 bit masks for above transform size. Each 1 represents a position where
+// we should apply a loop filter across the top border of an 8x8 block
+// boundary.
+//
+// In the case of TX_32X32 (low-order byte first) we end up with a mask
+// that looks like this:
+//
+// 11111111
+// 00000000
+// 00000000
+// 00000000
+// 11111111
+// 00000000
+// 00000000
+// 00000000
+//
+// A loopfilter should be applied to every fourth row vertically.
+static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
+ 0xffffffffffffffff, // TX_4X4
+ 0xffffffffffffffff, // TX_8x8
+ 0x00ff00ff00ff00ff, // TX_16x16
+ 0x000000ff000000ff, // TX_32x32
+};
+
+// 64 bit masks for prediction sizes (left). Each 1 represents a position
+// on the left border of an 8x8 block. These are aligned to the rightmost
+// appropriate bit, and then shifted into place.
+//
+// In the case of BLOCK_16X32 (low-order byte first) we end up with a mask
+// that looks like this:
+//
+// 10000000
+// 10000000
+// 10000000
+// 10000000
+// 00000000
+// 00000000
+// 00000000
+// 00000000
+static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
+ 0x0000000000000001, // BLOCK_4X4,
+ 0x0000000000000001, // BLOCK_4X8,
+ 0x0000000000000001, // BLOCK_8X4,
+ 0x0000000000000001, // BLOCK_8X8,
+ 0x0000000000000101, // BLOCK_8X16,
+ 0x0000000000000001, // BLOCK_16X8,
+ 0x0000000000000101, // BLOCK_16X16,
+ 0x0000000001010101, // BLOCK_16X32,
+ 0x0000000000000101, // BLOCK_32X16,
+ 0x0000000001010101, // BLOCK_32X32,
+ 0x0101010101010101, // BLOCK_32X64,
+ 0x0000000001010101, // BLOCK_64X32,
+ 0x0101010101010101, // BLOCK_64X64
+};
+
+// 64 bit mask to shift and set for each prediction size.
+static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
+ 0x0000000000000001, // BLOCK_4X4
+ 0x0000000000000001, // BLOCK_4X8
+ 0x0000000000000001, // BLOCK_8X4
+ 0x0000000000000001, // BLOCK_8X8
+ 0x0000000000000001, // BLOCK_8X16,
+ 0x0000000000000003, // BLOCK_16X8
+ 0x0000000000000003, // BLOCK_16X16
+ 0x0000000000000003, // BLOCK_16X32,
+ 0x000000000000000f, // BLOCK_32X16,
+ 0x000000000000000f, // BLOCK_32X32,
+ 0x000000000000000f, // BLOCK_32X64,
+ 0x00000000000000ff, // BLOCK_64X32,
+ 0x00000000000000ff, // BLOCK_64X64
+};
+// 64 bit mask to shift and set for each prediction size. A bit is set for
+// each 8x8 block that would be in the upper-left-most block of the given
+// block size in the 64x64 block.
+static const uint64_t size_mask[BLOCK_SIZES] = {
+ 0x0000000000000001, // BLOCK_4X4
+ 0x0000000000000001, // BLOCK_4X8
+ 0x0000000000000001, // BLOCK_8X4
+ 0x0000000000000001, // BLOCK_8X8
+ 0x0000000000000101, // BLOCK_8X16,
+ 0x0000000000000003, // BLOCK_16X8
+ 0x0000000000000303, // BLOCK_16X16
+ 0x0000000003030303, // BLOCK_16X32,
+ 0x0000000000000f0f, // BLOCK_32X16,
+ 0x000000000f0f0f0f, // BLOCK_32X32,
+ 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64,
+ 0x00000000ffffffff, // BLOCK_64X32,
+ 0xffffffffffffffff, // BLOCK_64X64
+};
+
+// These are used for masking the left and above borders.
+static const uint64_t left_border = 0x1111111111111111;
+static const uint64_t above_border = 0x000000ff000000ff;
+
+// 16 bit masks for uv transform sizes.
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
+ 0xffff, // TX_4X4
+ 0xffff, // TX_8x8
+ 0x5555, // TX_16x16
+ 0x1111, // TX_32x32
+};
+
+static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
+ 0xffff, // TX_4X4
+ 0xffff, // TX_8x8
+ 0x0f0f, // TX_16x16
+ 0x000f, // TX_32x32
+};
+
+// 16 bit left mask to shift and set for each uv prediction size.
+static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = {
+ 0x0001, // BLOCK_4X4,
+ 0x0001, // BLOCK_4X8,
+ 0x0001, // BLOCK_8X4,
+ 0x0001, // BLOCK_8X8,
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8,
+ 0x0001, // BLOCK_16X16,
+ 0x0011, // BLOCK_16X32,
+ 0x0001, // BLOCK_32X16,
+ 0x0011, // BLOCK_32X32,
+ 0x1111, // BLOCK_32X64
+ 0x0011, // BLOCK_64X32,
+ 0x1111, // BLOCK_64X64
+};
+// 16 bit above mask to shift and set for each uv prediction size.
+static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = {
+ 0x0001, // BLOCK_4X4
+ 0x0001, // BLOCK_4X8
+ 0x0001, // BLOCK_8X4
+ 0x0001, // BLOCK_8X8
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8
+ 0x0001, // BLOCK_16X16
+ 0x0001, // BLOCK_16X32,
+ 0x0003, // BLOCK_32X16,
+ 0x0003, // BLOCK_32X32,
+ 0x0003, // BLOCK_32X64,
+ 0x000f, // BLOCK_64X32,
+ 0x000f, // BLOCK_64X64
+};
+
+// 16 bit mask to shift and set for each uv prediction size.
+static const uint16_t size_mask_uv[BLOCK_SIZES] = {
+ 0x0001, // BLOCK_4X4
+ 0x0001, // BLOCK_4X8
+ 0x0001, // BLOCK_8X4
+ 0x0001, // BLOCK_8X8
+ 0x0001, // BLOCK_8X16,
+ 0x0001, // BLOCK_16X8
+ 0x0001, // BLOCK_16X16
+ 0x0011, // BLOCK_16X32,
+ 0x0003, // BLOCK_32X16,
+ 0x0033, // BLOCK_32X32,
+ 0x3333, // BLOCK_32X64,
+ 0x00ff, // BLOCK_64X32,
+ 0xffff, // BLOCK_64X64
+};
+static const uint16_t left_border_uv = 0x1111;
+static const uint16_t above_border_uv = 0x000f;
+
+
static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[DC_PRED] = 0;
lfi->mode_lf_lut[D45_PRED] = 0;
lfi->mode_lf_lut[D135_PRED] = 0;
lfi->mode_lf_lut[D117_PRED] = 0;
lfi->mode_lf_lut[D153_PRED] = 0;
- lfi->mode_lf_lut[D27_PRED] = 0;
+ lfi->mode_lf_lut[D207_PRED] = 0;
lfi->mode_lf_lut[D63_PRED] = 0;
lfi->mode_lf_lut[V_PRED] = 0;
lfi->mode_lf_lut[H_PRED] = 0;
@@ -39,7 +243,7 @@
lfi->mode_lf_lut[NEWMV] = 1;
}
-static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
int lvl;
// For each possible value for the loop filter fill out limits
@@ -78,7 +282,7 @@
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
-void vp9_loop_filter_frame_init(VP9_COMMON *const cm, int default_filt_lvl) {
+void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
int seg_id;
// n_shift is the multiplier for lf_deltas;
// the multiplier is 1 when filter_lvl is between 0 and 31,
@@ -124,9 +328,9 @@
}
}
-static int build_lfi(const loop_filter_info_n *const lfi_n,
- const MB_MODE_INFO *const mbmi,
- struct loop_filter_info *const lfi) {
+static int build_lfi(const loop_filter_info_n *lfi_n,
+ const MB_MODE_INFO *mbmi,
+ struct loop_filter_info *lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
@@ -236,10 +440,347 @@
}
}
-static void filter_block_plane(VP9_COMMON *const cm,
- struct macroblockd_plane *const plane,
- const MODE_INFO *mi,
- int mi_row, int mi_col) {
+// This function ORs into the current lfm structure the positions at which
+// to apply the loop filter for the specific mi we are looking at. It uses
+// information including the block size type (32x16, 32x32, etc.), the
+// transform size, whether there were any coefficients encoded, and the loop
+// filter strength of the block we are currently looking at. Shift is used
+// to position the 1's we produce.
+// TODO(JBB): Need another function for different resolution color.
+static void build_masks(const loop_filter_info_n *const lfi_n,
+ const MODE_INFO *mi, const int shift_y,
+ const int shift_uv,
+ LOOP_FILTER_MASK *lfm) {
+ const BLOCK_SIZE block_size = mi->mbmi.sb_type;
+ const TX_SIZE tx_size_y = mi->mbmi.tx_size;
+ const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi);
+ const int skip = mi->mbmi.skip_coeff;
+ const int seg = mi->mbmi.segment_id;
+ const int ref = mi->mbmi.ref_frame[0];
+ const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
+ const int filter_level = lfi_n->lvl[seg][ref][mode];
+ uint64_t *left_y = &lfm->left_y[tx_size_y];
+ uint64_t *above_y = &lfm->above_y[tx_size_y];
+ uint64_t *int_4x4_y = &lfm->int_4x4_y;
+ uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
+ uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
+ uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+
+ // If filter level is 0 we don't loop filter.
+ if (!filter_level)
+ return;
+
+ // These set 1 in the current block size for the block size edges.
+ // For instance if the block size is 32x16, we'll set:
+ // above = 1111
+ // 0000
+ // and
+ // left = 1000
+ // = 1000
+ // NOTE: In this example the low bit is the leftmost, so ( 1000 ) is
+ // stored as 1, not 8.
+ //
+ // U and V set things on a 16 bit scale.
+ //
+ *above_y |= above_prediction_mask[block_size] << shift_y;
+ *above_uv |= above_prediction_mask_uv[block_size] << shift_uv;
+ *left_y |= left_prediction_mask[block_size] << shift_y;
+ *left_uv |= left_prediction_mask_uv[block_size] << shift_uv;
+
+ // If the block has no coefficients and is not intra we skip applying
+ // the loop filter on block edges.
+ if (skip && ref > INTRA_FRAME)
+ return;
+
+ // Here we are adding a mask for the transform size. The transform
+ // size mask is set to be correct for a 64x64 prediction block size. We
+ // mask to match the size of the block we are working on and then shift it
+ // into place.
+ *above_y |= (size_mask[block_size] &
+ above_64x64_txform_mask[tx_size_y]) << shift_y;
+ *above_uv |= (size_mask_uv[block_size] &
+ above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+ *left_y |= (size_mask[block_size] &
+ left_64x64_txform_mask[tx_size_y]) << shift_y;
+ *left_uv |= (size_mask_uv[block_size] &
+ left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv;
+
+ // Here we are trying to determine what to do with the internal 4x4 block
+ // boundaries. These differ from the 4x4 boundaries on the outside edge of
+ // an 8x8 in that the internal ones can be skipped and don't depend on
+ // the prediction block size.
+ if (tx_size_y == TX_4X4) {
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ }
+ if (tx_size_uv == TX_4X4) {
+ *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
+ }
+}
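+
+// Worked example (illustrative): a BLOCK_32X16 at shift_y == 0 coded with
+// TX_16X16 first picks up above |= 0x000f and left |= 0x0101 from the
+// prediction-edge masks; ANDing size_mask[BLOCK_32X16] (0x0f0f) with the
+// TX_16X16 transform masks then adds the interior 16-pixel edges, leaving
+// above_y == 0x000f and left_y == 0x0505.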
+
+// This function does the same thing as the one above with the exception that
+// it only affects the y masks. It exists because for blocks < 16x16 in size,
+// we only update u and v masks on the first block.
+static void build_y_mask(const loop_filter_info_n *const lfi_n,
+ const MODE_INFO *mi, const int shift_y,
+ LOOP_FILTER_MASK *lfm) {
+ const BLOCK_SIZE block_size = mi->mbmi.sb_type;
+ const TX_SIZE tx_size_y = mi->mbmi.tx_size;
+ const int skip = mi->mbmi.skip_coeff;
+ const int seg = mi->mbmi.segment_id;
+ const int ref = mi->mbmi.ref_frame[0];
+ const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode];
+ const int filter_level = lfi_n->lvl[seg][ref][mode];
+ uint64_t *left_y = &lfm->left_y[tx_size_y];
+ uint64_t *above_y = &lfm->above_y[tx_size_y];
+ uint64_t *int_4x4_y = &lfm->int_4x4_y;
+
+ if (!filter_level)
+ return;
+
+ *above_y |= above_prediction_mask[block_size] << shift_y;
+ *left_y |= left_prediction_mask[block_size] << shift_y;
+
+ if (skip && ref > INTRA_FRAME)
+ return;
+
+ *above_y |= (size_mask[block_size] &
+ above_64x64_txform_mask[tx_size_y]) << shift_y;
+
+ *left_y |= (size_mask[block_size] &
+ left_64x64_txform_mask[tx_size_y]) << shift_y;
+
+ if (tx_size_y == TX_4X4) {
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ }
+}
+
+// This function sets up the bit masks for the entire 64x64 region represented
+// by mi_row, mi_col.
+// TODO(JBB): This function only works for yv12.
+static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
+ const MODE_INFO *mi, const int mode_info_stride,
+ LOOP_FILTER_MASK *lfm) {
+ int idx_32, idx_16, idx_8;
+ const loop_filter_info_n *const lfi_n = &cm->lf_info;
+ const MODE_INFO *mip = mi;
+ const MODE_INFO *mip2 = mi;
+
+ // These are offsets to the next mi in the 64x64 block. It is what gets
+ // added to the mi ptr as we go through each loop. It helps us to avoid
+ // setting up special row and column counters for each index. The last step
+ // brings us back to the starting position.
+ const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4,
+ -(mode_info_stride << 2) - 4};
+ const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2,
+ -(mode_info_stride << 1) - 2};
+ const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1};
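+
+ // Traversal example (illustrative): starting from the top-left mi of the
+ // 64x64, offset_32[] steps right 4 mis, then down 4 rows and back 4 mis,
+ // then right 4 again, visiting the four 32x32 quadrants in raster order;
+ // the last entry returns mip to the starting position.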
+
+ // The following variables represent shifts to position the current block
+ // mask over the appropriate block. A shift of 36 to the left will move
+ // the bits for the final 32 by 32 block in the 64x64 down 4 rows and
+ // right 4 columns to the appropriate spot.
+ const int shift_32_y[] = {0, 4, 32, 36};
+ const int shift_16_y[] = {0, 2, 16, 18};
+ const int shift_8_y[] = {0, 1, 8, 9};
+ const int shift_32_uv[] = {0, 2, 8, 10};
+ const int shift_16_uv[] = {0, 1, 4, 5};
+ int i;
+ const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
+ cm->mi_rows - mi_row : MI_BLOCK_SIZE);
+ const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
+ cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+
+ vp9_zero(*lfm);
+
+ // TODO(jimbankoski): Try moving most of the following code into the decode
+ // loop and storing lfm in the mbmi structure so that we don't have to go
+ // through the recursive loop structure multiple times.
+ switch (mip->mbmi.sb_type) {
+ case BLOCK_64X64:
+ build_masks(lfi_n, mip, 0, 0, lfm);
+ break;
+ case BLOCK_64X32:
+ build_masks(lfi_n, mip, 0, 0, lfm);
+ mip2 = mip + mode_info_stride * 4;
+ build_masks(lfi_n, mip2, 32, 8, lfm);
+ break;
+ case BLOCK_32X64:
+ build_masks(lfi_n, mip, 0, 0, lfm);
+ mip2 = mip + 4;
+ build_masks(lfi_n, mip2, 4, 2, lfm);
+ break;
+ default:
+ for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
+ const int shift_y = shift_32_y[idx_32];
+ const int shift_uv = shift_32_uv[idx_32];
+ const int mi_32_col_offset = ((idx_32 & 1) << 2);
+ const int mi_32_row_offset = ((idx_32 >> 1) << 2);
+ if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
+ continue;
+ switch (mip->mbmi.sb_type) {
+ case BLOCK_32X32:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ break;
+ case BLOCK_32X16:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ mip2 = mip + mode_info_stride * 2;
+ build_masks(lfi_n, mip2, shift_y + 16, shift_uv + 4, lfm);
+ break;
+ case BLOCK_16X32:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ mip2 = mip + 2;
+ build_masks(lfi_n, mip2, shift_y + 2, shift_uv + 1, lfm);
+ break;
+ default:
+ for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
+ const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
+ const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
+ const int mi_16_col_offset = mi_32_col_offset +
+ ((idx_16 & 1) << 1);
+ const int mi_16_row_offset = mi_32_row_offset +
+ ((idx_16 >> 1) << 1);
+
+ if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
+ continue;
+
+ switch (mip->mbmi.sb_type) {
+ case BLOCK_16X16:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ break;
+ case BLOCK_16X8:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ mip2 = mip + mode_info_stride;
+ build_y_mask(lfi_n, mip2, shift_y + 8, lfm);
+ break;
+ case BLOCK_8X16:
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ mip2 = mip + 1;
+ build_y_mask(lfi_n, mip2, shift_y + 1, lfm);
+ break;
+ default: {
+ const int shift_y = shift_32_y[idx_32] +
+ shift_16_y[idx_16] +
+ shift_8_y[0];
+ build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ mip += offset[0];
+ for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
+ const int shift_y = shift_32_y[idx_32] +
+ shift_16_y[idx_16] +
+ shift_8_y[idx_8];
+ const int mi_8_col_offset = mi_16_col_offset +
+ ((idx_8 & 1));
+ const int mi_8_row_offset = mi_16_row_offset +
+ ((idx_8 >> 1));
+
+ if (mi_8_col_offset >= max_cols ||
+ mi_8_row_offset >= max_rows)
+ continue;
+ build_y_mask(lfi_n, mip, shift_y, lfm);
+ }
+ break;
+ }
+ }
+ }
+ break;
+ }
+ }
+ break;
+ }
+ // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+ // for 32x32 transforms as well.
+ lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
+ lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
+ lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
+ lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32];
+
+ // We apply at least an 8 tap filter on every 32x32, even if the transform
+ // size is 4x4. So if the 4x4 bit is set on a border pixel, add it to the
+ // 8x8 mask and remove it from the 4x4 one.
+ lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border;
+ lfm->left_y[TX_4X4] &= ~left_border;
+ lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border;
+ lfm->above_y[TX_4X4] &= ~above_border;
+ lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv;
+ lfm->left_uv[TX_4X4] &= ~left_border_uv;
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv;
+ lfm->above_uv[TX_4X4] &= ~above_border_uv;
+
+ // We do some special edge handling.
+ if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+ const uint64_t rows = cm->mi_rows - mi_row;
+
+ // Each pixel inside the border gets a 1.
+ const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
+ const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+
+ // Remove values completely outside our border.
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= mask_y;
+ lfm->above_y[i] &= mask_y;
+ lfm->left_uv[i] &= mask_uv;
+ lfm->above_uv[i] &= mask_uv;
+ }
+ lfm->int_4x4_y &= mask_y;
+ lfm->int_4x4_uv &= mask_uv;
+
+ // We don't apply a wide loop filter on the last uv block row. If it is
+ // set, apply the shorter one instead.
+ if (rows == 1) {
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16];
+ lfm->above_uv[TX_16X16] = 0;
+ }
+ if (rows == 5) {
+ lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
+ lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
+ }
+ }
+
+ if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+ const uint64_t columns = cm->mi_cols - mi_col;
+
+ // Each pixel inside the border gets a 1; the multiply copies the border
+ // to where we need it.
+ const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101;
+ const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
+
+ // Internal edges are not applied on the last column of the image so
+ // we mask 1 more for the internal edges.
+ const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111;
+
+ // Remove the bits outside the image edge.
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= mask_y;
+ lfm->above_y[i] &= mask_y;
+ lfm->left_uv[i] &= mask_uv;
+ lfm->above_uv[i] &= mask_uv;
+ }
+ lfm->int_4x4_y &= mask_y;
+ lfm->int_4x4_uv &= mask_uv_int;
+
+ // We don't apply a wide loop filter on the last uv column. If it is
+ // set, apply the shorter one instead.
+ if (columns == 1) {
+ lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16];
+ lfm->left_uv[TX_16X16] = 0;
+ }
+ if (columns == 5) {
+ lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc);
+ lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc);
+ }
+ }
+ // We don't apply a loop filter on the first column in the image. Mask that out.
+ if (mi_col == 0) {
+ for (i = 0; i < TX_32X32; i++) {
+ lfm->left_y[i] &= 0xfefefefefefefefe;
+ lfm->left_uv[i] &= 0xeeee;
+ }
+ }
+}
+static void filter_block_plane_non420(VP9_COMMON *cm,
+ struct macroblockd_plane *plane,
+ const MODE_INFO *mi,
+ int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_x;
@@ -274,7 +815,7 @@
const int skip_this_r = skip_this && !block_edge_above;
const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
? get_uv_tx_size(&mi[c].mbmi)
- : mi[c].mbmi.txfm_size;
+ : mi[c].mbmi.tx_size;
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
@@ -356,11 +897,92 @@
}
}
+static void filter_block_plane(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ const MODE_INFO *mi,
+ int mi_row, int mi_col,
+ LOOP_FILTER_MASK *lfm) {
+ const int ss_x = plane->subsampling_x;
+ const int ss_y = plane->subsampling_y;
+ const int row_step = 1 << ss_x;
+ const int col_step = 1 << ss_y;
+ const int row_step_stride = cm->mode_info_stride * row_step;
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t* const dst0 = dst->buf;
+ unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
+ struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ int r, c;
+ int row_shift = 3 - ss_x;
+ int row_mask = 0xff >> (ss_x << 2);
+
+#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)
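+// MASK_ROW pulls one row out of a bitmask: with ss_x == 0 it extracts the
+// 8-bit row r of a 64-bit y mask; with ss_x == 1 it extracts the 4-bit row
+// of a 16-bit uv mask.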
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ int r_sampled = r >> ss_x;
+
+ // Determine the vertical edges that need filtering
+ for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+ if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
+ continue;
+ }
+ if (!plane->plane_type) {
+ mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
+ // Disable filtering on the leftmost column
+ filter_selectively_vert(dst->buf, dst->stride,
+ MASK_ROW(lfm->left_y[TX_16X16]),
+ MASK_ROW(lfm->left_y[TX_8X8]),
+ MASK_ROW(lfm->left_y[TX_4X4]),
+ MASK_ROW(lfm->int_4x4_y),
+ lfi[r]);
+ } else {
+ mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
+ // Disable filtering on the leftmost column
+ filter_selectively_vert(dst->buf, dst->stride,
+ MASK_ROW(lfm->left_uv[TX_16X16]),
+ MASK_ROW(lfm->left_uv[TX_8X8]),
+ MASK_ROW(lfm->left_uv[TX_4X4]),
+ MASK_ROW(lfm->int_4x4_uv),
+ lfi[r]);
+ }
+ dst->buf += 8 * dst->stride;
+ mi += row_step_stride;
+ }
+
+ // Now do horizontal pass
+ dst->buf = dst0;
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+ int r_sampled = r >> ss_x;
+
+ if (!plane->plane_type) {
+ filter_selectively_horiz(dst->buf, dst->stride,
+ MASK_ROW(lfm->above_y[TX_16X16]),
+ MASK_ROW(lfm->above_y[TX_8X8]),
+ MASK_ROW(lfm->above_y[TX_4X4]),
+ MASK_ROW(lfm->int_4x4_y),
+ mi_row + r == 0, lfi[r]);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride,
+ MASK_ROW(lfm->above_uv[TX_16X16]),
+ MASK_ROW(lfm->above_uv[TX_8X8]),
+ MASK_ROW(lfm->above_uv[TX_4X4]),
+ mask_4x4_int_r,
+ mi_row + r == 0, lfi[r]);
+ }
+ dst->buf += 8 * dst->stride;
+ }
+#undef MASK_ROW
+}
+
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
VP9_COMMON *cm, MACROBLOCKD *xd,
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
+ LOOP_FILTER_MASK lfm;
+ int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
+ xd->plane[1].subsampling_x == 1);
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -369,8 +991,18 @@
int plane;
setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non-420.
+ if (use_420)
+ setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
+
for (plane = 0; plane < num_planes; ++plane) {
- filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col);
+ if (use_420)
+ filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
+ &lfm);
+ else
+ filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
+ mi_row, mi_col);
}
}
}
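
The masks built by setup_mask() pack one bit per 8x8 block of a 64x64
superblock: 64-bit values (8 bits per row) for the y plane and 16-bit values
(4 bits per row) for the subsampled uv planes. A minimal sketch of how a bit
is addressed under that layout -- the helper below is illustrative only, not
part of the patch:

    /* Hypothetical helper: does the 8x8 block at (row, col) of a 64x64
     * superblock have its left edge marked for filtering? */
    static int left_edge_marked(uint64_t left_y_mask, int row, int col) {
      return (left_y_mask >> (row * 8 + col)) & 1;
    }
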
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index e27ba43..d8381ec 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -36,7 +36,7 @@
9, // D135_PRED
9, // D117_PRED
9, // D153_PRED
- 9, // D27_PRED
+ 9, // D207_PRED
9, // D63_PRED
9, // TM_PRED
0, // NEARESTMV
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 859c99e..48d3d2d 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -630,21 +630,21 @@
}
}
-int vp9_post_proc_frame(struct VP9Common *oci,
+int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
- int q = oci->lf.filter_level * 10 / 6;
+ int q = cm->lf.filter_level * 10 / 6;
int flags = ppflags->post_proc_flag;
int deblock_level = ppflags->deblocking_level;
int noise_level = ppflags->noise_level;
- if (!oci->frame_to_show)
+ if (!cm->frame_to_show)
return -1;
if (q > 63)
q = 63;
if (!flags) {
- *dest = *oci->frame_to_show;
+ *dest = *cm->frame_to_show;
return 0;
}
@@ -653,52 +653,52 @@
#endif
if (flags & VP9D_DEMACROBLOCK) {
- deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+ deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
q + (deblock_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
- vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
+ vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
} else {
- vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+ vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
}
if (flags & VP9D_ADDNOISE) {
- if (oci->postproc_state.last_q != q
- || oci->postproc_state.last_noise != noise_level) {
- fillrd(&oci->postproc_state, 63 - q, noise_level);
+ if (cm->postproc_state.last_q != q
+ || cm->postproc_state.last_noise != noise_level) {
+ fillrd(&cm->postproc_state, 63 - q, noise_level);
}
- vp9_plane_add_noise(oci->post_proc_buffer.y_buffer,
- oci->postproc_state.noise,
- oci->postproc_state.blackclamp,
- oci->postproc_state.whiteclamp,
- oci->postproc_state.bothclamp,
- oci->post_proc_buffer.y_width,
- oci->post_proc_buffer.y_height,
- oci->post_proc_buffer.y_stride);
+ vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
+ cm->postproc_state.noise,
+ cm->postproc_state.blackclamp,
+ cm->postproc_state.whiteclamp,
+ cm->postproc_state.bothclamp,
+ cm->post_proc_buffer.y_width,
+ cm->post_proc_buffer.y_height,
+ cm->post_proc_buffer.y_stride);
}
#if 0 && CONFIG_POSTPROC_VISUALIZER
if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
char message[512];
sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (oci->frame_type == KEY_FRAME),
- oci->refresh_golden_frame,
- oci->base_qindex,
- oci->filter_level,
+ (cm->frame_type == KEY_FRAME),
+ cm->refresh_golden_frame,
+ cm->base_qindex,
+ cm->filter_level,
flags,
- oci->mb_cols, oci->mb_rows);
- vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
+ cm->mb_cols, cm->mb_rows);
+ vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+ cm->post_proc_buffer.y_stride);
}
if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
int i, j;
uint8_t *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
- MODE_INFO *mi = oci->mi;
+ MODE_INFO *mi = cm->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
@@ -723,11 +723,11 @@
if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
int i, j;
uint8_t *y_ptr;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int mb_rows = post->y_height >> 4;
int mb_cols = post->y_width >> 4;
int mb_index = 0;
- MODE_INFO *mi = oci->mi;
+ MODE_INFO *mi = cm->mi;
y_ptr = post->y_buffer + 4 * post->y_stride + 4;
@@ -739,7 +739,7 @@
mi[mb_index].mbmi.mode != SPLITMV &&
mi[mb_index].mbmi.skip_coeff);
- if (oci->frame_type == KEY_FRAME)
+ if (cm->frame_type == KEY_FRAME)
sprintf(zz, "a");
else
sprintf(zz, "%c", dc_diff + '0');
@@ -759,19 +759,19 @@
char message[512];
snprintf(message, sizeof(message),
"Bitrate: %10.2f framerate: %10.2f ",
- oci->bitrate, oci->framerate);
- vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
- oci->post_proc_buffer.y_stride);
+ cm->bitrate, cm->framerate);
+ vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+ cm->post_proc_buffer.y_stride);
}
/* Draw motion vectors */
if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
- uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
+ uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
+ int y_stride = cm->post_proc_buffer.y_stride;
+ MODE_INFO *mi = cm->mi;
int x0, y0;
for (y0 = 0; y0 < height; y0 += 16) {
@@ -880,7 +880,7 @@
}
}
}
- } else if (mi->mbmi.mode >= NEARESTMV) {
+ } else if (is_inter_mode(mi->mbmi.mode)) {
MV *mv = &mi->mbmi.mv.as_mv;
const int lx0 = x0 + 8;
const int ly0 = y0 + 8;
@@ -908,14 +908,14 @@
if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
&& (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
- uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
- uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
- uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
+ uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+ uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+ uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+ int y_stride = cm->post_proc_buffer.y_stride;
+ MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
for (x = 0; x < width; x += 16) {
@@ -973,14 +973,14 @@
if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
ppflags->display_ref_frame_flag) {
int y, x;
- YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
- uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
- uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
- uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
- int y_stride = oci->post_proc_buffer.y_stride;
- MODE_INFO *mi = oci->mi;
+ uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+ uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+ uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+ int y_stride = cm->post_proc_buffer.y_stride;
+ MODE_INFO *mi = cm->mi;
for (y = 0; y < height; y += 16) {
for (x = 0; x < width; x += 16) {
@@ -1006,11 +1006,11 @@
}
#endif
- *dest = oci->post_proc_buffer;
+ *dest = cm->post_proc_buffer;
/* handle problem with extending borders */
- dest->y_width = oci->width;
- dest->y_height = oci->height;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
dest->uv_height = dest->y_height / 2;
return 0;
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index 759855f..c63beae 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -26,7 +26,7 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
-int vp9_post_proc_frame(struct VP9Common *oci,
+int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index 97ccb13..494cea7 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -90,25 +90,24 @@
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
if (above_in_image && left_in_image) { // both edges available
- if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
- left_mbmi->ref_frame[1] <= INTRA_FRAME)
+ if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
// neither edge uses comp pred (0/1)
pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
(left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
- else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
+ else if (!has_second_ref(above_mbmi))
// one of two edges uses comp pred (2/3)
pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
- above_mbmi->ref_frame[0] == INTRA_FRAME);
- else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
+ !is_inter_block(above_mbmi));
+ else if (!has_second_ref(left_mbmi))
// one of two edges uses comp pred (2/3)
pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
- left_mbmi->ref_frame[0] == INTRA_FRAME);
+ !is_inter_block(left_mbmi));
else // both edges use comp pred (4)
pred_context = 4;
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
- if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+ if (!has_second_ref(edge_mbmi))
// edge does not use comp pred (0/1)
pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
else
@@ -146,14 +145,14 @@
} else if (above_intra || left_intra) { // intra/inter
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3)
+ if (!has_second_ref(edge_mbmi)) // single pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
else // comp pred (1/3)
pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
!= cm->comp_var_ref[1]);
} else { // inter/inter
- int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
- int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
+ const int l_sg = !has_second_ref(left_mbmi);
+ const int a_sg = !has_second_ref(above_mbmi);
MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
: above_mbmi->ref_frame[var_ref_idx];
MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
@@ -187,13 +186,15 @@
} else if (above_in_image || left_in_image) { // one edge available
const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
- if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+ if (!is_inter_block(edge_mbmi)) {
pred_context = 2;
- else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
- pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+ } else {
+ if (has_second_ref(edge_mbmi))
+ pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
!= cm->comp_var_ref[1]);
- else
- pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ else
+ pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+ }
} else { // no edges available (2)
pred_context = 2;
}
@@ -368,11 +369,11 @@
if (above_in_image)
above_context = above_mbmi->skip_coeff ? max_tx_size
- : above_mbmi->txfm_size;
+ : above_mbmi->tx_size;
if (left_in_image)
left_context = left_mbmi->skip_coeff ? max_tx_size
- : left_mbmi->txfm_size;
+ : left_mbmi->tx_size;
if (!left_in_image)
left_context = above_context;
@@ -383,7 +384,7 @@
return above_context + left_context > max_tx_size;
}
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, uint8_t pred_flag) {
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
const int bw = 1 << mi_width_log2(bsize);
@@ -397,7 +398,7 @@
mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
}
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, uint8_t pred_flag) {
MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
const int bw = 1 << mi_width_log2(bsize);
@@ -412,7 +413,7 @@
}
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
- BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = 1 << mi_width_log2(bsize);
const int bh = 1 << mi_height_log2(bsize);
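
The has_second_ref() and is_inter_block() predicates used above replace raw
ref_frame comparisons. Judging from the expressions they replace, they are
presumably defined along these lines (a sketch, not part of this patch):

    static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[0] > INTRA_FRAME;  /* replaces ref_frame[0] == INTRA_FRAME tests */
    }

    static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
      return mbmi->ref_frame[1] > INTRA_FRAME;  /* replaces ref_frame[1] <= INTRA_FRAME tests */
    }
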
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index c01d394..89e1356 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_onyxc_int.h"
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
- BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
+ BLOCK_SIZE bsize, int mi_row, int mi_col);
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
@@ -32,7 +32,7 @@
return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
}
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, uint8_t pred_flag);
static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
@@ -53,7 +53,7 @@
return xd->mode_info_context->mbmi.skip_coeff;
}
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, uint8_t pred_flag);
unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -103,7 +103,7 @@
unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
-static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
const struct tx_probs *tx_probs) {
if (bsize < BLOCK_16X16)
return tx_probs->p8x8[context];
@@ -115,12 +115,12 @@
static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
const struct tx_probs *tx_probs) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
const int context = vp9_get_pred_context_tx_size(xd);
return get_tx_probs(bsize, context, tx_probs);
}
-static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
TX_SIZE tx_size, struct tx_counts *tx_counts) {
if (bsize >= BLOCK_32X32)
tx_counts->p32x32[context][tx_size]++;
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 89c2aa8..88bba3a 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -117,8 +117,7 @@
int x, y;
};
-static void build_inter_predictors(int plane, int block,
- BLOCK_SIZE_TYPE bsize,
+static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
int pred_w, int pred_h,
void *argv) {
const struct build_inter_predictors_args* const arg = argv;
@@ -174,14 +173,14 @@
}
// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
// sizes smaller than 16x16 yet.
typedef void (*foreach_predicted_block_visitor)(int plane, int block,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int pred_w, int pred_h,
void *arg);
static INLINE void foreach_predicted_block_in_plane(
- const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
+ const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
foreach_predicted_block_visitor visit, void *arg) {
int i, x, y;
@@ -216,8 +215,7 @@
}
}
-static void build_inter_predictors_for_planes(MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize,
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int plane_from, int plane_to) {
int plane;
@@ -231,16 +229,16 @@
}
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
}
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
}
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
}
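
The three vp9_build_inter_predictors_sb* entry points above all funnel into
build_inter_predictors_for_planes(), which walks each plane's predicted
blocks through the foreach_predicted_block_visitor callback. A sketch of the
caller side, assuming the args struct from this file carries the MACROBLOCKD
pointer plus the pixel position, MI_SIZE being the 8-pixel mode-info unit
(field layout is illustrative):

    struct build_inter_predictors_args args = {
      xd, mi_col * MI_SIZE, mi_row * MI_SIZE  /* xd, x, y */
    };
    foreach_predicted_block_in_plane(xd, bsize, plane,
                                     build_inter_predictors, &args);
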
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 82c0796..504b793 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -15,19 +15,14 @@
#include "vp9/common/vp9_onyxc_int.h"
struct subpix_fn_table;
-void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
- int mb_row,
- int mb_col,
- BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
- int mb_row,
- int mb_col,
- BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
-void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
- int mb_row, int mb_col,
- BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE filter,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 2a1bf5c..4a451b9 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -26,7 +26,7 @@
ADST_ADST, // D135
ADST_DCT, // D117
DCT_ADST, // D153
- DCT_ADST, // D27
+ DCT_ADST, // D207
ADST_DCT, // D63
ADST_ADST, // TM
DCT_DCT, // NEARESTMV
@@ -297,7 +297,7 @@
intra_pred_allsizes(pred[V_PRED], v);
intra_pred_allsizes(pred[H_PRED], h);
- intra_pred_allsizes(pred[D27_PRED], d207);
+ intra_pred_allsizes(pred[D207_PRED], d207);
intra_pred_allsizes(pred[D45_PRED], d45);
intra_pred_allsizes(pred[D63_PRED], d63);
intra_pred_allsizes(pred[D117_PRED], d117);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 30c1b26..104db6a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -301,7 +301,7 @@
specialize vp9_short_idct4x4_add sse2 neon
prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct8x8_1_add sse2
+specialize vp9_short_idct8x8_1_add sse2 neon
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2 neon
@@ -310,7 +310,7 @@
specialize vp9_short_idct10_8x8_add sse2 neon
prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct16x16_1_add sse2
+specialize vp9_short_idct16x16_1_add sse2 neon
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2 neon
@@ -701,7 +701,7 @@
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32
+specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on this before enabling it
#
# Structured Similarity (SSIM)
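
The prototype/specialize pairs in vp9_rtcd_defs.sh drive run-time CPU
dispatch: for each specialize line the build emits per-ISA declarations plus
a function pointer resolved at startup. Roughly, assuming the usual rtcd
output, "specialize vp9_short_idct8x8_1_add sse2 neon" expands to something
like:

    void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride);
    void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int dest_stride);
    void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
    RTCD_EXTERN void (*vp9_short_idct8x8_1_add)(int16_t *input, uint8_t *dest,
                                                int dest_stride);

with setup code selecting the _neon or _sse2 variant when the CPU supports
it and falling back to _c otherwise.
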
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index 1b9147e..cc909e2 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -34,6 +34,6 @@
#endif
struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *);
+void vp9_machine_specific_config(struct VP9Common *cm);
#endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 4af4f94..fa4dd9b 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -17,23 +17,11 @@
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
- DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
-
- DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
-
- DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
- DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
-
-
__m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- int i = 0;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
const unsigned int extended_limit = _limit[0] * 0x01010101u;
const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
@@ -44,41 +32,35 @@
const __m128i blimit =
_mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
- p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
-
- _mm_storel_epi64((__m128i *)ap[4], p4);
- _mm_storel_epi64((__m128i *)ap[3], p3);
- _mm_storel_epi64((__m128i *)ap[2], p2);
- _mm_storel_epi64((__m128i *)ap[1], p1);
- _mm_storel_epi64((__m128i *)ap[0], p0);
- _mm_storel_epi64((__m128i *)aq[4], q4);
- _mm_storel_epi64((__m128i *)aq[3], q3);
- _mm_storel_epi64((__m128i *)aq[2], q2);
- _mm_storel_epi64((__m128i *)aq[1], q1);
- _mm_storel_epi64((__m128i *)aq[0], q0);
-
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
+ (__m64 *)(s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
+ (__m64 *)(s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
+ (__m64 *)(s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
+ (__m64 *)(s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
+ (__m64 *)(s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
{
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -88,19 +70,16 @@
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
+ mask = _mm_max_epu8(abs_p1p0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
- mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
}
@@ -110,21 +89,19 @@
const __m128i t4 = _mm_set1_epi8(4);
const __m128i t3 = _mm_set1_epi8(3);
const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
@@ -134,82 +111,60 @@
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
/* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
// loopfilter done
{
__m128i work;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
+ flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
- flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
- _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0),
- _mm_subs_epu8(q0, q5)));
- _mm_storel_epi64((__m128i *)ap[5], p5);
- _mm_storel_epi64((__m128i *)aq[5], q5);
- flat2 = _mm_max_epu8(work, flat2);
- p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
- _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0),
- _mm_subs_epu8(q0, q6)));
- _mm_storel_epi64((__m128i *)ap[6], p6);
- _mm_storel_epi64((__m128i *)aq[6], q6);
- flat2 = _mm_max_epu8(work, flat2);
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *)(s + 5 * p)));
- p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
- _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0),
- _mm_subs_epu8(q0, q7)));
- _mm_storel_epi64((__m128i *)ap[7], p7);
- _mm_storel_epi64((__m128i *)aq[7], q7);
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *)(s + 6 * p)));
+
+ flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *)(s + 7 * p)));
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
@@ -220,260 +175,198 @@
{
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- {
- __m128i workp_shft;
- __m128i a, b, c;
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
- p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
- p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
- p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
- q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
- q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
- q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
- c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
- c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
- b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
- a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
- a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
- _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+ pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
- c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
- a = _mm_add_epi16(q1, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
- _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
- a = _mm_add_epi16(q2, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
- _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
- a = _mm_add_epi16(q3, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
- _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
- b = _mm_add_epi16(q3, b);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
- _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
- c = _mm_add_epi16(q4, c);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
- b = _mm_add_epi16(q3, b);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
- _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
- a = _mm_add_epi16(q5, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
- a = _mm_add_epi16(q6, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
- }
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
}
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- work_a = _mm_loadl_epi64((__m128i *)ap[2]);
- p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)flat_op[2], p2);
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
- p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
- work_a = _mm_andnot_si128(flat, ps1);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
- _mm_storel_epi64((__m128i *)flat_op[1], p1);
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
- p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
- work_a = _mm_andnot_si128(flat, ps0);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
- _mm_storel_epi64((__m128i *)flat_op[0], p0);
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
- q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
- work_a = _mm_andnot_si128(flat, qs0);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
- _mm_storel_epi64((__m128i *)flat_oq[0], q0);
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
- q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
- work_a = _mm_andnot_si128(flat, qs1);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
- _mm_storel_epi64((__m128i *)flat_oq[1], q1);
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
- work_a = _mm_loadl_epi64((__m128i *)aq[2]);
- q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
- _mm_storel_epi64((__m128i *)flat_oq[2], q2);
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
- // write out op6 - op3
- {
- unsigned char *dst = (s - 7 * p);
- for (i = 6; i > 2; i--) {
- __m128i flat2_output;
- work_a = _mm_loadl_epi64((__m128i *)ap[i]);
- flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
- work_a = _mm_andnot_si128(flat2, work_a);
- flat2_output = _mm_and_si128(flat2, flat2_output);
- work_a = _mm_or_si128(work_a, flat2_output);
- _mm_storel_epi64((__m128i *)dst, work_a);
- dst += p;
- }
- }
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
- work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
- p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p2 = _mm_and_si128(flat2, p2);
- p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
- work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
- p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p1 = _mm_and_si128(flat2, p1);
- p1 = _mm_or_si128(work_a, p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
- work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
- p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p0 = _mm_and_si128(flat2, p0);
- p0 = _mm_or_si128(work_a, p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
- q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q0 = _mm_and_si128(flat2, q0);
- q0 = _mm_or_si128(work_a, q0);
- _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
- q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q1 = _mm_and_si128(flat2, q1);
- q1 = _mm_or_si128(work_a, q1);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
- q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q2 = _mm_and_si128(flat2, q2);
- q2 = _mm_or_si128(work_a, q2);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-
- // write out oq3 - oq7
- {
- unsigned char *dst = (s + 3 * p);
- for (i = 3; i < 7; i++) {
- __m128i flat2_output;
- work_a = _mm_loadl_epi64((__m128i *)aq[i]);
- flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
- work_a = _mm_andnot_si128(flat2, work_a);
- flat2_output = _mm_and_si128(flat2, flat2_output);
- work_a = _mm_or_si128(work_a, flat2_output);
- _mm_storel_epi64((__m128i *)dst, work_a);
- dst += p;
- }
- }
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
}
}
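
The rewrite above eliminates the ap[]/aq[] spill arrays by packing each
p-side row into the low 8 bytes and the matching q-side row into the high 8
bytes of a single xmm register, so every 128-bit operation filters both
sides of the edge at once. The load/swap pattern, as used in the patch:

    __m128i q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));       /* p1 row -> low 64 bits */
    q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                         (__m64 *)(s + 1 * p)));  /* q1 row -> high 64 bits */
    __m128i p1q1 = _mm_shuffle_epi32(q1p1, 78);  /* 78 = 0b01001110: swap 64-bit halves */
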
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d1c59c3..8dfb22c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -43,7 +43,7 @@
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+ BLOCK_SIZE bsize, vp9_reader *r) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
@@ -58,7 +58,7 @@
}
static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode,
- BLOCK_SIZE_TYPE bsize, int allow_select,
+ BLOCK_SIZE bsize, int allow_select,
vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
@@ -75,7 +75,7 @@
return TX_4X4;
}
-static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
+static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, int segment_id) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = 1 << mi_width_log2(bsize);
@@ -95,7 +95,7 @@
vp9_reader *r) {
MACROBLOCKD *const xd = &pbi->mb;
struct segmentation *const seg = &pbi->common.seg;
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
int segment_id;
if (!seg->enabled)
@@ -114,7 +114,7 @@
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
int pred_segment_id, segment_id;
if (!seg->enabled)
@@ -155,12 +155,12 @@
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &m->mbmi;
- const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int mis = cm->mode_info_stride;
mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r);
mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
- mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
+ mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
@@ -381,7 +381,7 @@
vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
@@ -439,31 +439,31 @@
MB_MODE_INFO *const mbmi = &mi->mbmi;
int_mv *const mv0 = &mbmi->mv[0];
int_mv *const mv1 = &mbmi->mv[1];
- const BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
int_mv nearest, nearby, best_mv;
int_mv nearest_second, nearby_second, best_mv_second;
uint8_t inter_mode_ctx;
- MV_REFERENCE_FRAME ref0, ref1;
+ MV_REFERENCE_FRAME ref0;
int is_compound;
+ mbmi->uv_mode = DC_PRED;
read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame);
ref0 = mbmi->ref_frame[0];
- ref1 = mbmi->ref_frame[1];
- is_compound = ref1 > INTRA_FRAME;
+ is_compound = has_second_ref(mbmi);
vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
ref0, mbmi->ref_mvs[ref0], mi_row, mi_col);
inter_mode_ctx = mbmi->mode_context[ref0];
- if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
- else if (bsize >= BLOCK_8X8)
- mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
-
- mbmi->uv_mode = DC_PRED;
+ } else {
+ if (bsize >= BLOCK_8X8)
+ mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
+ }
// nearest, nearby
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -471,11 +471,8 @@
best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int;
}
- mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
- ? read_switchable_filter_type(pbi, r)
- : cm->mcomp_filter_type;
-
if (is_compound) {
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context,
ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
@@ -486,6 +483,10 @@
}
}
+ mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+ ? read_switchable_filter_type(pbi, r)
+ : cm->mcomp_filter_type;
+
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2
const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2
@@ -590,8 +591,8 @@
mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r);
mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r);
inter_block = read_is_inter_block(pbi, mbmi->segment_id, r);
- mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
- !mbmi->skip_coeff || !inter_block, r);
+ mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type,
+ !mbmi->skip_coeff || !inter_block, r);
if (inter_block)
read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r);
@@ -668,7 +669,7 @@
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
MODE_INFO *mi = xd->mode_info_context;
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
const int bw = 1 << mi_width_log2(bsize);
const int bh = 1 << mi_height_log2(bsize);
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index fd88b6e..41e406d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -87,7 +87,7 @@
xd->plane[i].dequant = cm->uv_dequant[xd->q_index];
}
-static void decode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
MACROBLOCKD* const xd = arg;
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -123,8 +123,7 @@
}
}
-static void decode_block_intra(int plane, int block,
- BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
MACROBLOCKD* const xd = arg;
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -133,32 +132,23 @@
block);
uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
pd->dst.buf, pd->dst.stride);
- int b_mode;
- const int tx_ib = raster_block >> tx_size;
- const int mode = (plane == 0) ? mi->mbmi.mode : mi->mbmi.uv_mode;
-
- if (plane == 0 && mi->mbmi.sb_type < BLOCK_8X8) {
- assert(plane_bsize == BLOCK_8X8);
- b_mode = mi->bmi[raster_block].as_mode;
- } else {
- b_mode = mode;
- }
+ const MB_PREDICTION_MODE mode = (plane == 0)
+ ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode
+ : mi->mbmi.mode)
+ : mi->mbmi.uv_mode;
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
extend_for_intra(xd, plane_bsize, plane, block, tx_size);
- vp9_predict_intra_block(xd, tx_ib, b_width_log2(plane_bsize), tx_size, b_mode,
- dst, pd->dst.stride,
- dst, pd->dst.stride);
+ vp9_predict_intra_block(xd, raster_block >> tx_size,
+ b_width_log2(plane_bsize), tx_size, mode,
+ dst, pd->dst.stride, dst, pd->dst.stride);
- // Early exit if there are no coefficients
- if (mi->mbmi.skip_coeff)
- return;
-
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff)
+ decode_block(plane, block, plane_bsize, tx_size, arg);
}
-static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) {
+static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) {
MACROBLOCKD *const xd = &pbi->mb;
if (xd->mode_info_context->mbmi.skip_coeff) {
@@ -173,20 +163,19 @@
}
}
-static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize,
+static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int bh = 1 << mi_height_log2(bsize);
- const int bw = 1 << mi_width_log2(bsize);
- const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int offset = mi_row * cm->mode_info_stride + mi_col;
- xd->mode_info_context = cm->mi + mi_idx;
+ xd->mode_info_context = cm->mi + offset;
xd->mode_info_context->mbmi.sb_type = bsize;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
- xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + mi_idx : NULL;
-
+ xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + offset : NULL;
set_skip_context(cm, xd, mi_row, mi_col);
set_partition_seg_context(cm, xd, mi_row, mi_col);
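set_offsets() now reads the block dimensions from lookup tables instead of recomputing shifts. A hypothetical consistency check, assuming both forms measure the block in 8x8 (MI) units:

#include <assert.h>
static void check_mi_dims(BLOCK_SIZE bsize) {
  // The tables must agree with the old log2-based math for every size.
  assert(num_8x8_blocks_wide_lookup[bsize] == 1 << mi_width_log2(bsize));
  assert(num_8x8_blocks_high_lookup[bsize] == 1 << mi_height_log2(bsize));
}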
@@ -215,7 +204,7 @@
}
static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+ vp9_reader *r, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const int less8x8 = bsize < BLOCK_8X8;
@@ -243,7 +232,7 @@
int eobtotal;
set_ref(pbi, 0, mi_row, mi_col);
- if (mbmi->ref_frame[1] > INTRA_FRAME)
+ if (has_second_ref(mbmi))
set_ref(pbi, 1, mi_row, mi_col);
vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
@@ -265,14 +254,14 @@
}
static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const pc = &pbi->common;
+ vp9_reader* r, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- const int bs = (1 << mi_width_log2(bsize)) / 2;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
- if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
if (bsize < BLOCK_8X8) {
@@ -280,25 +269,25 @@
return;
} else {
int pl;
- const int idx = check_bsize_coverage(bs, pc->mi_rows, pc->mi_cols,
+ const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols,
mi_row, mi_col);
- set_partition_seg_context(pc, xd, mi_row, mi_col);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
if (idx == 0)
partition = treed_read(r, vp9_partition_tree,
- pc->fc.partition_prob[pc->frame_type][pl]);
+ cm->fc.partition_prob[cm->frame_type][pl]);
else if (idx > 0 &&
- !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx]))
+ !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx]))
partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
else
partition = PARTITION_SPLIT;
- pc->counts.partition[pl][partition]++;
+ cm->counts.partition[pl][partition]++;
}
subsize = get_subsize(bsize, partition);
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
switch (partition) {
case PARTITION_NONE:
@@ -306,22 +295,22 @@
break;
case PARTITION_HORZ:
decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *(get_sb_index(xd, subsize)) = 1;
- if (mi_row + bs < pc->mi_rows)
- decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize);
+ *get_sb_index(xd, subsize) = 1;
+ if (mi_row + hbs < cm->mi_rows)
+ decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_modes_b(pbi, mi_row, mi_col, r, subsize);
- *(get_sb_index(xd, subsize)) = 1;
- if (mi_col + bs < pc->mi_cols)
- decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize);
+ *get_sb_index(xd, subsize) = 1;
+ if (mi_col + hbs < cm->mi_cols)
+ decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT: {
int n;
for (n = 0; n < 4; n++) {
const int j = n >> 1, i = n & 1;
- *(get_sb_index(xd, subsize)) = n;
- decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
+ *get_sb_index(xd, subsize) = n;
+ decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize);
}
} break;
default:
@@ -331,7 +320,7 @@
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) {
- set_partition_seg_context(pc, xd, mi_row, mi_col);
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
update_partition_context(xd, subsize, bsize);
}
}
@@ -339,18 +328,18 @@
static void setup_token_decoder(VP9D_COMP *pbi,
const uint8_t *data, size_t read_size,
vp9_reader *r) {
- VP9_COMMON *pc = &pbi->common;
+ VP9_COMMON *cm = &pbi->common;
const uint8_t *data_end = pbi->source + pbi->source_sz;
// Validate the calculated partition length. If the buffer
// described by the partition can't be fully read, then restrict
// it to the portion that can be (for EC mode) or throw an error.
if (!read_is_valid(data, read_size, data_end))
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt tile length");
if (vp9_reader_init(r, data, read_size))
- vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate bool decoder %d", 1);
}
@@ -582,31 +571,30 @@
static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
const int num_threads = pbi->oxcf.max_threads;
- VP9_COMMON *const pc = &pbi->common;
+ VP9_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
- YV12_BUFFER_CONFIG *const fb = &pc->yv12_fb[pc->new_fb_idx];
+ YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx];
if (pbi->do_loopfilter_inline) {
if (num_threads > 1) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
lf_data->frame_buffer = fb;
- lf_data->cm = pc;
+ lf_data->cm = cm;
lf_data->xd = pbi->mb;
lf_data->stop = 0;
lf_data->y_only = 0;
}
- vp9_loop_filter_frame_init(pc, pc->lf.filter_level);
+ vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
- for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end;
+ for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
mi_row += MI_BLOCK_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within it.
- vp9_zero(pc->left_context);
- vp9_zero(pc->left_seg_context);
- for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end;
- mi_col += MI_BLOCK_SIZE) {
+ vp9_zero(cm->left_context);
+ vp9_zero(cm->left_seg_context);
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ mi_col += MI_BLOCK_SIZE)
decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64);
- }
if (pbi->do_loopfilter_inline) {
// delay the loopfilter by 1 macroblock row.
@@ -617,7 +605,7 @@
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MI_BLOCK_SIZE >= pc->cur_tile_mi_row_end) continue;
+ if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue;
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_start;
@@ -625,7 +613,7 @@
pbi->lf_worker.hook = vp9_loop_filter_worker;
vp9_worker_launch(&pbi->lf_worker);
} else {
- vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0);
+ vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0);
}
}
}
@@ -640,8 +628,8 @@
} else {
lf_start = mi_row - MI_BLOCK_SIZE;
}
- vp9_loop_filter_rows(fb, pc, &pbi->mb,
- lf_start, pc->mi_rows, 0);
+ vp9_loop_filter_rows(fb, cm, &pbi->mb,
+ lf_start, cm->mi_rows, 0);
}
}
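The inline loopfilter deliberately trails decode by one superblock row, because filtering a row reads reconstructed pixels from the row beneath it. A sketch of that pipeline, with decode_row()/filter_row() as hypothetical stand-ins for the worker calls above:

for (mi_row = row_start; mi_row < row_end; mi_row += MI_BLOCK_SIZE) {
  decode_row(mi_row);                    // reconstruct this SB row
  if (mi_row > row_start)
    filter_row(mi_row - MI_BLOCK_SIZE);  // filter the row behind it
}
filter_row(row_end - MI_BLOCK_SIZE);     // drain the final row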
@@ -664,20 +652,20 @@
static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
vp9_reader residual_bc;
- VP9_COMMON *const pc = &pbi->common;
+ VP9_COMMON *const cm = &pbi->common;
const uint8_t *const data_end = pbi->source + pbi->source_sz;
- const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols);
- const int tile_cols = 1 << pc->log2_tile_cols;
- const int tile_rows = 1 << pc->log2_tile_rows;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
int tile_row, tile_col;
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(pc->above_context[0], 0,
+ vpx_memset(cm->above_context[0], 0,
sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols));
- vpx_memset(pc->above_seg_context, 0,
+ vpx_memset(cm->above_seg_context, 0,
sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
@@ -702,9 +690,9 @@
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(pc, tile_row);
+ vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- vp9_get_tile_col_offsets(pc, tile_col);
+ vp9_get_tile_col_offsets(cm, tile_col);
setup_token_decoder(pbi, data_ptr2[tile_row][tile_col],
data_end - data_ptr2[tile_row][tile_col],
&residual_bc);
@@ -718,16 +706,16 @@
int has_more;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(pc, tile_row);
+ vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
size_t size;
- vp9_get_tile_col_offsets(pc, tile_col);
+ vp9_get_tile_col_offsets(cm, tile_col);
has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
if (has_more) {
if (!read_is_valid(data, 4, data_end))
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt tile length");
size = read_be32(data);
@@ -940,17 +928,17 @@
int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
int i;
- VP9_COMMON *const pc = &pbi->common;
+ VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *data = pbi->source;
const uint8_t *data_end = pbi->source + pbi->source_sz;
struct vp9_read_bit_buffer rb = { data, data_end, 0,
- pc, error_handler };
+ cm, error_handler };
const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
- const int keyframe = pc->frame_type == KEY_FRAME;
- YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx];
+ const int keyframe = cm->frame_type == KEY_FRAME;
+ YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
if (!first_partition_size) {
// showing a frame directly
@@ -961,42 +949,39 @@
xd->corrupted = 0;
new_fb->corrupted = 0;
pbi->do_loopfilter_inline =
- (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pc->lf.filter_level;
+ (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
if (!read_is_valid(data, first_partition_size, data_end))
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- xd->mode_info_context = pc->mi;
- xd->prev_mode_info_context = pc->prev_mi;
- xd->mode_info_stride = pc->mode_info_stride;
+ xd->mode_info_context = cm->mi;
+ xd->prev_mode_info_context = cm->prev_mi;
+ xd->mode_info_stride = cm->mode_info_stride;
- init_dequantizer(pc, &pbi->mb);
+ init_dequantizer(cm, &pbi->mb);
- if (!keyframe)
- vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
+ cm->fc = cm->frame_contexts[cm->frame_context_idx];
- pc->fc = pc->frame_contexts[pc->frame_context_idx];
-
- vp9_zero(pc->counts);
+ vp9_zero(cm->counts);
new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
- setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y);
+ setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
// clear out the coeff buffer
for (i = 0; i < MAX_MB_PLANE; ++i)
vp9_zero(xd->plane[i].qcoeff);
- set_prev_mi(pc);
+ set_prev_mi(cm);
*p_data_end = decode_tiles(pbi, data + first_partition_size);
- pc->last_width = pc->width;
- pc->last_height = pc->height;
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
new_fb->corrupted |= xd->corrupted;
@@ -1004,21 +989,21 @@
if (keyframe && !new_fb->corrupted)
pbi->decoded_key_frame = 1;
else
- vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"A stream must start with a complete key frame");
}
- if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {
- vp9_adapt_coef_probs(pc);
+ if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+ vp9_adapt_coef_probs(cm);
- if (!keyframe && !pc->intra_only) {
- vp9_adapt_mode_probs(pc);
- vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv);
+ if (!keyframe && !cm->intra_only) {
+ vp9_adapt_mode_probs(cm);
+ vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv);
}
}
- if (pc->refresh_frame_context)
- pc->frame_contexts[pc->frame_context_idx] = pc->fc;
+ if (cm->refresh_frame_context)
+ cm->frame_contexts[cm->frame_context_idx] = cm->fc;
return 0;
}
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 00b6d67..c665f6f 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -15,7 +15,7 @@
struct VP9Common;
struct VP9Decompressor;
-void vp9_init_dequantizer(struct VP9Common *pc);
+void vp9_init_dequantizer(struct VP9Common *cm);
int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
#endif // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 3a62bba..c119093 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -94,9 +94,8 @@
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
- ENTROPY_CONTEXT above_ec, left_ec;
const int ref = is_inter_block(&xd->mode_info_context->mbmi);
- int band, pt, c = 0;
+ int band, c = 0;
vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
@@ -104,38 +103,10 @@
vp9_prob *prob;
vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
const int16_t *scan, *nb;
- uint8_t token_cache[1024];
const uint8_t *band_translate;
-
- switch (tx_size) {
- default:
- case TX_4X4:
- scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx));
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
- band_translate = vp9_coefband_trans_4x4;
- break;
- case TX_8X8:
- scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- above_ec = !!*(uint16_t *)A;
- left_ec = !!*(uint16_t *)L;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_16X16:
- scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- above_ec = !!*(uint32_t *)A;
- left_ec = !!*(uint32_t *)L;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_32X32:
- scan = vp9_default_scan_32x32;
- above_ec = !!*(uint64_t *)A;
- left_ec = !!*(uint64_t *)L;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- }
-
- pt = combine_entropy_contexts(above_ec, left_ec);
+ uint8_t token_cache[1024];
+ int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L,
+ &scan, &band_translate);
nb = vp9_get_coef_neighbors_handle(scan);
while (1) {
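The removed per-tx_size switch above is folded into get_entropy_context(). A hedged reconstruction of the context half of that helper (the scan and band-translate outputs are omitted), based on the deleted cases, which read 1, 2, 4 and 8 context entries per edge for TX_4X4 through TX_32X32:

static int entropy_context_sketch(TX_SIZE tx_size,
                                  const ENTROPY_CONTEXT *a,
                                  const ENTROPY_CONTEXT *l) {
  const int n = 1 << tx_size;  // entries per edge: 1, 2, 4, 8
  int above_ec = 0, left_ec = 0, i;
  for (i = 0; i < n; ++i) {    // equivalent to the removed !!*(uintN_t *) casts
    above_ec |= a[i] != 0;
    left_ec |= l[i] != 0;
  }
  return combine_entropy_contexts(above_ec, left_ec);
}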
@@ -239,17 +210,13 @@
return c;
}
-static int get_eob(struct segmentation *seg, int segment_id, int eob_max) {
- return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
struct decode_block_args {
VP9D_COMP *pbi;
vp9_reader *r;
int *eobtotal;
};
-static void decode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *argv) {
const struct decode_block_args* const arg = argv;
@@ -258,8 +225,7 @@
struct segmentation *seg = &arg->pbi->common.seg;
struct macroblockd_plane* pd = &xd->plane[plane];
const int segment_id = xd->mode_info_context->mbmi.segment_id;
- const int ss_txfrm_size = tx_size << 1;
- const int seg_eob = get_eob(seg, segment_id, 16 << ss_txfrm_size);
+ const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
int aoff, loff, eob;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
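get_tx_eob() folds the deleted get_eob() helper together with the `16 << (tx_size << 1)` maximum. A sketch grounded directly in the removed lines:

static int get_tx_eob_sketch(struct segmentation *seg, int segment_id,
                             TX_SIZE tx_size) {
  const int eob_max = 16 << (tx_size << 1);  // 16, 64, 256, 1024 coeffs
  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}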
@@ -275,7 +241,7 @@
*arg->eobtotal += eob;
}
-int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) {
int eobtotal = 0;
struct decode_block_args args = {pbi, r, &eobtotal};
foreach_transformed_block(&pbi->mb, bsize, decode_block, &args);
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index f98fe8d..cf07c56 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,6 +15,6 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize);
+int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index d7c73b6..f3bbc17 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -199,7 +199,7 @@
}
static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size,
- BLOCK_SIZE_TYPE bsize, vp9_writer *w) {
+ BLOCK_SIZE bsize, vp9_writer *w) {
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
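write_selected_tx_size() codes the transform size as a short unary-style tree; the hunk shows only the first branch. A hedged sketch of the full pattern, where further bits are emitted only while the block size still permits a larger transform:

vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
  vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
  if (bsize >= BLOCK_32X32 && tx_size != TX_8X8)
    vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
}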
@@ -237,7 +237,7 @@
static void update_switchable_interp_probs(VP9_COMP *const cpi,
vp9_writer* const bc) {
- VP9_COMMON *const pc = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
[SWITCHABLE_FILTERS - 1][2];
vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
@@ -246,21 +246,21 @@
vp9_tree_probs_from_distribution(
vp9_switchable_interp_tree,
new_prob[j], branch_ct[j],
- pc->counts.switchable_interp[j], 0);
+ cm->counts.switchable_interp[j], 0);
}
for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i],
+ vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
MODE_UPDATE_PROB, branch_ct[j][i]);
}
}
#ifdef MODE_STATS
if (!cpi->dummy_packing)
- update_switchable_interp_stats(pc);
+ update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
@@ -269,10 +269,10 @@
vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
new_prob, branch_ct,
- pc->counts.inter_mode[i], NEARESTMV);
+ cm->counts.inter_mode[i], NEARESTMV);
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j],
+ vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
MODE_UPDATE_PROB, branch_ct[j]);
}
}
@@ -356,39 +356,39 @@
// This function encodes the reference frame
static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
- VP9_COMMON *const pc = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mi = &xd->mode_info_context->mbmi;
const int segment_id = mi->segment_id;
- int seg_ref_active = vp9_segfeature_active(&pc->seg, segment_id,
+ int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
// If segment level coding of this signal is disabled...
// or the segment allows multiple reference frame options
if (!seg_ref_active) {
// does the feature use compound prediction or not
// (if not specified at the frame/segment level)
- if (pc->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->comp_pred_mode == HYBRID_PREDICTION) {
vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
- vp9_get_pred_prob_comp_inter_inter(pc, xd));
+ vp9_get_pred_prob_comp_inter_inter(cm, xd));
} else {
assert((mi->ref_frame[1] <= INTRA_FRAME) ==
- (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+ (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
}
if (mi->ref_frame[1] > INTRA_FRAME) {
vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
- vp9_get_pred_prob_comp_ref_p(pc, xd));
+ vp9_get_pred_prob_comp_ref_p(cm, xd));
} else {
vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
- vp9_get_pred_prob_single_ref_p1(pc, xd));
+ vp9_get_pred_prob_single_ref_p1(cm, xd));
if (mi->ref_frame[0] != LAST_FRAME)
vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
- vp9_get_pred_prob_single_ref_p2(pc, xd));
+ vp9_get_pred_prob_single_ref_p2(cm, xd));
}
} else {
assert(mi->ref_frame[1] <= INTRA_FRAME);
- assert(vp9_get_segdata(&pc->seg, segment_id, SEG_LVL_REF_FRAME) ==
+ assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
mi->ref_frame[0]);
}
@@ -397,20 +397,20 @@
}
static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
- VP9_COMMON *const pc = &cpi->common;
- const nmv_context *nmvc = &pc->fc.nmvc;
+ VP9_COMMON *const cm = &cpi->common;
+ const nmv_context *nmvc = &cm->fc.nmvc;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- struct segmentation *seg = &pc->seg;
+ struct segmentation *seg = &cm->seg;
MB_MODE_INFO *const mi = &m->mbmi;
const MV_REFERENCE_FRAME rf = mi->ref_frame[0];
const MB_PREDICTION_MODE mode = mi->mode;
const int segment_id = mi->segment_id;
int skip_coeff;
- const BLOCK_SIZE_TYPE bsize = mi->sb_type;
+ const BLOCK_SIZE bsize = mi->sb_type;
const int allow_hp = xd->allow_high_precision_mv;
- x->partition_info = x->pi + (m - pc->mi);
+ x->partition_info = x->pi + (m - cm->mi);
#ifdef ENTROPY_STATS
active_section = 9;
@@ -432,12 +432,12 @@
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
vp9_write(bc, rf != INTRA_FRAME,
- vp9_get_pred_prob_intra_inter(pc, xd));
+ vp9_get_pred_prob_intra_inter(cm, xd));
- if (bsize >= BLOCK_8X8 && pc->tx_mode == TX_MODE_SELECT &&
+ if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
!(rf != INTRA_FRAME &&
(skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
- write_selected_tx_size(cpi, mi->txfm_size, bsize, bc);
+ write_selected_tx_size(cpi, mi->tx_size, bsize, bc);
}
if (rf == INTRA_FRAME) {
@@ -446,7 +446,7 @@
#endif
if (bsize >= BLOCK_8X8) {
- write_intra_mode(bc, mode, pc->fc.y_mode_prob[size_group_lookup[bsize]]);
+ write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
} else {
int idx, idy;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -454,11 +454,11 @@
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode;
- write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]);
+ write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]);
}
}
}
- write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]);
+ write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
} else {
vp9_prob *mv_ref_p;
encode_ref_frame(cpi, bc);
@@ -472,18 +472,18 @@
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
write_sb_mv_ref(bc, mode, mv_ref_p);
- ++pc->counts.inter_mode[mi->mode_context[rf]]
+ ++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(mode)];
}
}
- if (pc->mcomp_filter_type == SWITCHABLE) {
+ if (cm->mcomp_filter_type == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
write_token(bc, vp9_switchable_interp_tree,
- pc->fc.switchable_interp_prob[ctx],
+ cm->fc.switchable_interp_prob[ctx],
&vp9_switchable_interp_encodings[mi->interp_filter]);
} else {
- assert(mi->interp_filter == pc->mcomp_filter_type);
+ assert(mi->interp_filter == cm->mcomp_filter_type);
}
if (bsize < BLOCK_8X8) {
@@ -499,7 +499,7 @@
blockmode = x->partition_info->bmi[j].mode;
blockmv = m->bmi[j].as_mv[0];
write_sb_mv_ref(bc, blockmode, mv_ref_p);
- ++pc->counts.inter_mode[mi->mode_context[rf]]
+ ++cm->counts.inter_mode[mi->mode_context[rf]]
[inter_mode_offset(blockmode)];
if (blockmode == NEWMV) {
@@ -533,11 +533,11 @@
static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m,
vp9_writer *bc) {
- const VP9_COMMON *const c = &cpi->common;
+ const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const struct segmentation *const seg = &c->seg;
+ const struct segmentation *const seg = &cm->seg;
const int ym = m->mbmi.mode;
- const int mis = c->mode_info_stride;
+ const int mis = cm->mode_info_stride;
const int segment_id = m->mbmi.segment_id;
if (seg->update_map)
@@ -545,8 +545,8 @@
write_skip_coeff(cpi, segment_id, m, bc);
- if (m->mbmi.sb_type >= BLOCK_8X8 && c->tx_mode == TX_MODE_SELECT)
- write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc);
+ if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+ write_selected_tx_size(cpi, m->mbmi.tx_size, m->mbmi.sb_type, bc);
if (m->mbmi.sb_type >= BLOCK_8X8) {
const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
@@ -584,11 +584,13 @@
if (m->mbmi.sb_type < BLOCK_8X8)
if (xd->ab_index > 0)
return;
+
xd->mode_info_context = m;
- set_mi_row_col(&cpi->common, xd, mi_row,
- 1 << mi_height_log2(m->mbmi.sb_type),
- mi_col, 1 << mi_width_log2(m->mbmi.sb_type));
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+ set_mi_row_col(&cpi->common, xd,
+ mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
+ mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
+
+ if (cm->frame_type == KEY_FRAME || cm->intra_only) {
write_mb_modes_kf(cpi, m, bc);
#ifdef ENTROPY_STATS
active_section = 8;
@@ -606,8 +608,7 @@
static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
@@ -615,7 +616,7 @@
int bs = (1 << bsl) / 4; // mode_info step for subsize
int n;
PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -683,18 +684,18 @@
static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
- VP9_COMMON *const c = &cpi->common;
- const int mis = c->mode_info_stride;
- MODE_INFO *m, *m_ptr = c->mi;
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mode_info_stride;
+ MODE_INFO *m, *m_ptr = cm->mi;
int mi_row, mi_col;
- m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis;
+ m_ptr += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
- for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end;
+ for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
mi_row += 8, m_ptr += 8 * mis) {
m = m_ptr;
- vp9_zero(c->left_seg_context);
- for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end;
+ vp9_zero(cm->left_seg_context);
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE)
write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
}
@@ -783,94 +784,170 @@
vp9_coeff_probs_model *old_frame_coef_probs =
cpi->common.fc.coef_probs[tx_size];
vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
- int i, j, k, l, t;
- int update[2] = {0, 0};
- int savings;
-
+ const vp9_prob upd = VP9_COEF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ switch (cpi->sf.use_fast_coef_updates) {
+ case 0: {
+ /* dry run to see if there is any update at all needed */
+ int savings = 0;
+ int update[2] = {0, 0};
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+ int s;
+ int u = 0;
- const int tstart = 0;
- /* dry run to see if there is any udpate at all needed */
- savings = 0;
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < REF_TYPES; ++j) {
- for (k = 0; k < COEF_BANDS; ++k) {
- // int prev_coef_savings[ENTROPY_NODES] = {0};
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- for (t = tstart; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
- int s;
- int u = 0;
-
- if (l >= 3 && k == 0)
- continue;
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
- if (s > 0 && newp != oldp)
- u = 1;
- if (u)
- savings += s - (int)(vp9_cost_zero(upd));
- else
- savings -= (int)(vp9_cost_zero(upd));
- update[u]++;
- }
- }
- }
- }
- }
-
- // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
- /* Is coef updated at all */
- if (update[1] == 0 || savings < 0) {
- vp9_write_bit(bc, 0);
- return;
- }
- vp9_write_bit(bc, 1);
- for (i = 0; i < BLOCK_TYPES; ++i) {
- for (j = 0; j < REF_TYPES; ++j) {
- for (k = 0; k < COEF_BANDS; ++k) {
- // int prev_coef_savings[ENTROPY_NODES] = {0};
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- // calc probs and branch cts for this frame only
- for (t = tstart; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
- const vp9_prob upd = VP9_COEF_UPDATE_PROB;
- int s;
- int u = 0;
- if (l >= 3 && k == 0)
- continue;
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t],
- *oldp, &newp, upd);
- if (s > 0 && newp != *oldp)
- u = 1;
- vp9_write(bc, u, upd);
-#ifdef ENTROPY_STATS
- if (!cpi->dummy_packing)
- ++tree_update_hist[tx_size][i][j][k][l][t][u];
-#endif
- if (u) {
- /* send/use new probability */
- vp9_write_prob_diff_update(bc, newp, *oldp);
- *oldp = newp;
+ if (l >= 3 && k == 0)
+ continue;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+ if (s > 0 && newp != oldp)
+ u = 1;
+ if (u)
+ savings += s - (int)(vp9_cost_zero(upd));
+ else
+ savings -= (int)(vp9_cost_zero(upd));
+ update[u]++;
+ }
}
}
}
}
+
+ // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ vp9_write_bit(bc, 0);
+ return;
+ }
+ vp9_write_bit(bc, 1);
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+ int s;
+ int u = 0;
+ if (l >= 3 && k == 0)
+ continue;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp)
+ u = 1;
+ vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
}
+
+ case 1:
+ case 2: {
+ const int prev_coef_contexts_to_update =
+ (cpi->sf.use_fast_coef_updates == 2 ?
+ PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS);
+ const int coef_band_to_update =
+ (cpi->sf.use_fast_coef_updates == 2 ?
+ COEF_BANDS >> 1 : COEF_BANDS);
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+ if (l >= 3 && k == 0)
+ continue;
+ if (l >= prev_coef_contexts_to_update ||
+ k >= coef_band_to_update) {
+ u = 0;
+ } else {
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp)
+ u = 1;
+ }
+ updates += u;
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ vp9_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ vp9_write(bc, 0, upd);
+ }
+ vp9_write(bc, u, upd);
+#ifdef ENTROPY_STATS
+ if (!cpi->dummy_packing)
+ ++tree_update_hist[tx_size][i][j][k][l][t][u];
+#endif
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ vp9_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+
+ default:
+ assert(0);
}
}
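The fast-update paths cannot know whether the per-tree "any update" header bit should be 1 until the first node actually updates, hence the noupdates_before_first counter. The key bookkeeping, isolated from the hunk above with explanatory comments:

if (u == 1 && updates == 1) {
  // First real update: emit the deferred header bit now, then replay a
  // zero "update?" flag for every node that was scanned before it.
  int v;
  vp9_write_bit(bc, 1);
  for (v = 0; v < noupdates_before_first; ++v)
    vp9_write(bc, 0, upd);
}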
@@ -1457,7 +1534,7 @@
vp9_compute_update_table();
#ifdef ENTROPY_STATS
- if (pc->frame_type == INTER_FRAME)
+ if (cm->frame_type == INTER_FRAME)
active_section = 0;
else
active_section = 7;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 9426f44..7b2dd11 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -170,10 +170,10 @@
PICK_MODE_CONTEXT sb64_context;
int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
- BLOCK_SIZE_TYPE b_partitioning[4][4][4];
- BLOCK_SIZE_TYPE mb_partitioning[4][4];
- BLOCK_SIZE_TYPE sb_partitioning[4];
- BLOCK_SIZE_TYPE sb64_partitioning;
+ BLOCK_SIZE b_partitioning[4][4][4];
+ BLOCK_SIZE mb_partitioning[4][4];
+ BLOCK_SIZE sb_partitioning[4];
+ BLOCK_SIZE sb64_partitioning;
void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 3112dad..4f4ad04 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -1077,6 +1077,44 @@
output[30] = step[30];
output[31] = step[31];
+ // damp the magnitude by a factor of 4 so that the intermediate values
+ // stay within the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
@@ -1112,44 +1150,6 @@
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- step[0] = half_round_shift(step[0]);
- step[1] = half_round_shift(step[1]);
- step[2] = half_round_shift(step[2]);
- step[3] = half_round_shift(step[3]);
- step[4] = half_round_shift(step[4]);
- step[5] = half_round_shift(step[5]);
- step[6] = half_round_shift(step[6]);
- step[7] = half_round_shift(step[7]);
- step[8] = half_round_shift(step[8]);
- step[9] = half_round_shift(step[9]);
- step[10] = half_round_shift(step[10]);
- step[11] = half_round_shift(step[11]);
- step[12] = half_round_shift(step[12]);
- step[13] = half_round_shift(step[13]);
- step[14] = half_round_shift(step[14]);
- step[15] = half_round_shift(step[15]);
-
- step[16] = half_round_shift(step[16]);
- step[17] = half_round_shift(step[17]);
- step[18] = half_round_shift(step[18]);
- step[19] = half_round_shift(step[19]);
- step[20] = half_round_shift(step[20]);
- step[21] = half_round_shift(step[21]);
- step[22] = half_round_shift(step[22]);
- step[23] = half_round_shift(step[23]);
- step[24] = half_round_shift(step[24]);
- step[25] = half_round_shift(step[25]);
- step[26] = half_round_shift(step[26]);
- step[27] = half_round_shift(step[27]);
- step[28] = half_round_shift(step[28]);
- step[29] = half_round_shift(step[29]);
- step[30] = half_round_shift(step[30]);
- step[31] = half_round_shift(step[31]);
- }
-
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
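This hunk moves the conditional rounding pass from after stage 3 to after stage 2, so the damping happens before the stage-3 butterflies can overflow. A hedged reconstruction of half_round_shift(), assuming its conventional form elsewhere in vp9_dct.c:

static int16_t half_round_shift(int input) {
  // Divide by 4 with symmetric rounding; this is the "damp by 4" that
  // keeps the 32-point transform's intermediates inside int16_t.
  return (int16_t)((input + 1 + (input < 0)) >> 2);
}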
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a0a3ace..eb83903 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -47,7 +47,7 @@
#endif
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
+ int mi_row, int mi_col, BLOCK_SIZE bsize);
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
@@ -78,7 +78,7 @@
};
static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bs) {
+ BLOCK_SIZE bs) {
unsigned int var, sse;
var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
x->plane[0].src.stride,
@@ -336,7 +336,7 @@
}
static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE_TYPE bsize, int output_enabled) {
+ BLOCK_SIZE bsize, int output_enabled) {
int i, x_idx, y;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
@@ -404,7 +404,7 @@
THR_D135_PRED /*D135_PRED*/,
THR_D117_PRED /*D117_PRED*/,
THR_D153_PRED /*D153_PRED*/,
- THR_D27_PRED /*D27_PRED*/,
+ THR_D207_PRED /*D207_PRED*/,
THR_D63_PRED /*D63_PRED*/,
THR_TM /*TM_PRED*/,
THR_B_PRED /*I4X4_PRED*/,
@@ -469,10 +469,10 @@
}
static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCK * const x = &cpi->mb;
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCKD * const xd = &x->e_mbd;
+ BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &cpi->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
const int dst_fb_idx = cm->new_fb_idx;
const int idx_str = xd->mode_info_stride * mi_row + mi_col;
@@ -553,7 +553,7 @@
static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
- BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
@@ -637,9 +637,8 @@
// TODO(jingning): the variables used here are a little complicated. Needs
// further refactoring to organize the temporary buffers, when recursive
// partition down to 4x4 block size is enabled.
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD * const xd = &x->e_mbd;
+static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
@@ -674,9 +673,8 @@
}
}
-static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD *xd = &x->e_mbd;
+static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_partitioning;
@@ -696,7 +694,7 @@
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -727,7 +725,7 @@
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
@@ -758,7 +756,7 @@
}
static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+ int output_enabled, BLOCK_SIZE bsize, int sub_index) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
@@ -767,7 +765,7 @@
return;
if (sub_index != -1)
- *(get_sb_index(xd, bsize)) = sub_index;
+ *get_sb_index(xd, bsize) = sub_index;
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
@@ -788,15 +786,15 @@
}
static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE_TYPE bsize) {
+ int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- BLOCK_SIZE_TYPE c1 = BLOCK_8X8;
+ BLOCK_SIZE c1 = BLOCK_8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
int UNINITIALIZED_IS_SAFE(pl);
PARTITION_TYPE partition;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
int i;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
@@ -837,7 +835,7 @@
for (i = 0; i < 4; i++) {
const int x_idx = i & 1, y_idx = i >> 1;
- *(get_sb_index(xd, subsize)) = i;
+ *get_sb_index(xd, subsize) = i;
encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
@@ -853,8 +851,7 @@
}
}
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
- BLOCK_SIZE_TYPE bsize) {
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
int block_row, block_col;
@@ -876,23 +873,17 @@
}
}
-static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
- BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO *mi,
+ BLOCK_SIZE bsize, int mis, int mi_row,
int mi_col) {
- int row, col;
- int bwl = b_width_log2(bsize);
- int bhl = b_height_log2(bsize);
- int bsl = (bwl > bhl ? bwl : bhl);
-
- int bs = (1 << bsl) / 2; // Block size in units of 8 pels.
- MODE_INFO *m2 = m + mi_row * mis + mi_col;
- for (row = 0; row < bs; row++) {
- for (col = 0; col < bs; col++) {
- if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
- continue;
- m2[row * mis + col].mbmi.sb_type = bsize;
- }
- }
+ int r, c;
+ const int bs = MAX(num_8x8_blocks_wide_lookup[bsize],
+ num_8x8_blocks_high_lookup[bsize]);
+ MODE_INFO *const mi2 = &mi[mi_row * mis + mi_col];
+ for (r = 0; r < bs; r++)
+ for (c = 0; c < bs; c++)
+ if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols)
+ mi2[r * mis + c].mbmi.sb_type = bsize;
}
typedef struct {
@@ -929,9 +920,9 @@
V64X64,
} TREE_LEVEL;
-static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) {
+static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) {
int i;
- switch (block_size) {
+ switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
node->vt = &vt->vt;
@@ -988,9 +979,9 @@
a->sum_error + b->sum_error, a->count + b->count);
}
-static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
vt_node node;
- tree_to_node(data, block_size, &node);
+ tree_to_node(data, bsize, &node);
sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
@@ -1000,7 +991,7 @@
#if PERFORM_RANDOM_PARTITIONING
static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE_TYPE block_size, int mi_row,
+ BLOCK_SIZE block_size, int mi_row,
int mi_col, int mi_size) {
VP9_COMMON * const cm = &cpi->common;
vt_node vt;
@@ -1039,27 +1030,27 @@
#else // !PERFORM_RANDOM_PARTITIONING
static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
- BLOCK_SIZE_TYPE block_size, int mi_row,
+ BLOCK_SIZE bsize, int mi_row,
int mi_col, int mi_size) {
VP9_COMMON * const cm = &cpi->common;
vt_node vt;
const int mis = cm->mode_info_stride;
int64_t threshold = 50 * cpi->common.base_qindex;
- tree_to_node(data, block_size, &vt);
+ tree_to_node(data, bsize, &vt);
// split none is available only if we have more than half a block size
// in width and height inside the visible image
if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
&& vt.vt->none.variance < threshold) {
- set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ set_block_size(cm, m, bsize, mis, mi_row, mi_col);
return 1;
}
// vertical split is available on all but the bottom border
if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
&& vt.vt->vert[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row,
mi_col);
return 1;
}
@@ -1067,7 +1058,7 @@
// horizontal split is available on all but the right border
if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
&& vt.vt->horz[1].variance < threshold) {
- set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row,
mi_col);
return 1;
}
@@ -1192,7 +1183,7 @@
}
static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
int *rate, int64_t *dist, int do_recon) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -1206,7 +1197,7 @@
int bss = (1 << bsl) / 4;
int i, pl;
PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
int last_part_rate = INT_MAX;
@@ -1217,9 +1208,9 @@
int64_t none_dist = INT_MAX;
int chosen_rate = INT_MAX;
int64_t chosen_dist = INT_MAX;
- BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
int splits_below = 0;
- BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type;
+ BLOCK_SIZE bs_type = m->mbmi.sb_type;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -1283,7 +1274,7 @@
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1292,7 +1283,7 @@
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
+ *get_sb_index(xd, subsize) = 1;
pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1306,7 +1297,7 @@
}
break;
case PARTITION_VERT:
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1315,7 +1306,7 @@
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
+ *get_sb_index(xd, subsize) = 1;
pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1341,7 +1332,7 @@
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *(get_sb_index(xd, subsize)) = i;
+ *get_sb_index(xd, subsize) = i;
rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx,
mi_col + x_idx, subsize, &rt, &dt, i != 3);
@@ -1366,7 +1357,7 @@
&& partition != PARTITION_SPLIT && bsize > BLOCK_8X8
&& (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows)
&& (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) {
- BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
split_rate = 0;
split_dist = 0;
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1384,9 +1375,9 @@
|| (mi_col + x_idx >= cm->mi_cols))
continue;
- *(get_sb_index(xd, split_subsize)) = i;
- *(get_sb_partitioning(x, bsize)) = split_subsize;
- *(get_sb_partitioning(x, split_subsize)) = split_subsize;
+ *get_sb_index(xd, split_subsize) = i;
+ *get_sb_partitioning(x, bsize) = split_subsize;
+ *get_sb_partitioning(x, split_subsize) = split_subsize;
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1454,14 +1445,17 @@
*dist = chosen_dist;
}
-static const BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZES] =
- { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 };
-static const BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZES] =
- { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
- BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
- BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 };
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64,
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+};
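These tables bound the auto-partition search one step below and above an observed neighbouring size. A hypothetical illustration of how they are indexed, with the results read off the tables above:

const BLOCK_SIZE seen = BLOCK_16X16;            // size found in a neighbour
const BLOCK_SIZE lo = min_partition_size[seen]; // BLOCK_8X8
const BLOCK_SIZE hi = max_partition_size[seen]; // BLOCK_32X32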
// Look at all the mode_info entries for blocks that are part of this
// partition and find the min and max values for sb_type.
@@ -1471,8 +1465,8 @@
// The min and max are assumed to have been initialized prior to calling this
// function so repeat calls can accumulate a min and max of more than one sb64.
static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO * mi,
- BLOCK_SIZE_TYPE * min_block_size,
- BLOCK_SIZE_TYPE * max_block_size ) {
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size ) {
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
int sb_width_in_blocks = MI_BLOCK_SIZE;
int sb_height_in_blocks = MI_BLOCK_SIZE;
@@ -1482,8 +1476,8 @@
// Check the sb_type for each block that belongs to this region.
for (i = 0; i < sb_height_in_blocks; ++i) {
for (j = 0; j < sb_width_in_blocks; ++j) {
- *min_block_size = MIN(*min_block_size, mi[index+j].mbmi.sb_type);
- *max_block_size = MAX(*max_block_size, mi[index+j].mbmi.sb_type);
+ *min_block_size = MIN(*min_block_size, mi[index + j].mbmi.sb_type);
+ *max_block_size = MAX(*max_block_size, mi[index + j].mbmi.sb_type);
}
index += xd->mode_info_stride;
}
@@ -1492,8 +1486,8 @@
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
static void rd_auto_partition_range(VP9_COMP *cpi,
- BLOCK_SIZE_TYPE * min_block_size,
- BLOCK_SIZE_TYPE * max_block_size) {
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO *mi = xd->mode_info_context;
MODE_INFO *above_sb64_mi;
@@ -1544,8 +1538,7 @@
}
}
-static void compute_fast_motion_search_level(VP9_COMP *const cpi,
- const BLOCK_SIZE_TYPE bsize) {
+static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1644,26 +1637,25 @@
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
- int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
- int bsl = b_width_log2(bsize), bs = 1 << bsl;
- int ms = bs / 2;
+ const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
TOKENEXTRA *tp_orig = *tp;
int i, pl;
- BLOCK_SIZE_TYPE subsize;
+ BLOCK_SIZE subsize;
int this_rate, sum_rate = 0, best_rate = INT_MAX;
int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
int64_t sum_rd = 0;
int do_split = bsize >= BLOCK_8X8;
int do_rect = 1;
// Override skipping rectangular partition operations for edge blocks
- const int force_horz_split = (mi_row + (ms >> 1) >= cm->mi_rows);
- const int force_vert_split = (mi_col + (ms >> 1) >= cm->mi_cols);
+ const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+ const int force_vert_split = (mi_col + ms >= cm->mi_cols);
int partition_none_allowed = !force_horz_split && !force_vert_split;
int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
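
With the lookup table, ms is half the block width directly in 8x8 mode-info
units, so the edge tests shed their extra `>> 1`. A standalone check that the
two formulations agree for BLOCK_64X64 (the lookup values are copied from
VP9's tables; the bare index 12 for BLOCK_64X64 is illustrative):

#include <assert.h>

static const int num_8x8_blocks_wide[13] = { 1, 1, 1, 1, 1, 2, 2, 2,
                                             4, 4, 4, 8, 8 };

int main(void) {
  const int ms = num_8x8_blocks_wide[12] / 2;  // new form, BLOCK_64X64
  // Old form: bs = 1 << b_width_log2(BLOCK_64X64) = 16 columns of 4x4;
  // ms_old = bs / 2, and the edge test shifted it into 8x8 units.
  const int ms_old_shifted = ((1 << 4) / 2) >> 1;
  assert(ms == 4 && ms == ms_old_shifted);
  return 0;
}
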
@@ -1743,14 +1735,13 @@
if (do_split) {
subsize = get_subsize(bsize, PARTITION_SPLIT);
for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
- int x_idx = (i & 1) * (ms >> 1);
- int y_idx = (i >> 1) * (ms >> 1);
+ const int x_idx = (i & 1) * ms;
+ const int y_idx = (i >> 1) * ms;
- if ((mi_row + y_idx >= cm->mi_rows) ||
- (mi_col + x_idx >= cm->mi_cols))
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- *(get_sb_index(xd, subsize)) = i;
+ *get_sb_index(xd, subsize) = i;
rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rate, &this_dist, i != 3, best_rd - sum_rd);
@@ -1796,17 +1787,17 @@
// PARTITION_HORZ
if (partition_horz_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_HORZ);
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
- if (sum_rd < best_rd && mi_row + (ms >> 1) < cm->mi_rows) {
+ if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &this_rate,
+ *get_sb_index(xd, subsize) = 1;
+ pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1836,16 +1827,16 @@
if (partition_vert_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_VERT);
- *(get_sb_index(xd, subsize)) = 0;
+ *get_sb_index(xd, subsize) = 0;
pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
- if (sum_rd < best_rd && mi_col + (ms >> 1) < cm->mi_cols) {
+ if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &this_rate,
+ *get_sb_index(xd, subsize) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -2218,25 +2209,25 @@
}
static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
- TX_SIZE txfm_size) {
+ TX_SIZE tx_size) {
int x, y;
for (y = 0; y < ymbs; y++) {
for (x = 0; x < xmbs; x++)
- mi[y * mis + x].mbmi.txfm_size = txfm_size;
+ mi[y * mis + x].mbmi.tx_size = tx_size;
}
}
static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
- TX_SIZE txfm_max, int bw, int bh, int mi_row,
- int mi_col, BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON * const cm = &cpi->common;
- MB_MODE_INFO * const mbmi = &mi->mbmi;
+ TX_SIZE max_tx_size, int bw, int bh,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (mbmi->txfm_size > txfm_max) {
+ if (mbmi->tx_size > max_tx_size) {
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
const int ymbs = MIN(bh, cm->mi_rows - mi_row);
@@ -2245,57 +2236,49 @@
xd->mode_info_context = mi;
assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
get_skip_flag(mi, mis, ymbs, xmbs));
- set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+ set_txfm_flag(mi, mis, ymbs, xmbs, max_tx_size);
}
}
static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
- TX_SIZE txfm_max, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON * const cm = &cpi->common;
+ TX_SIZE max_tx_size, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bwl, bhl;
- const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+ int bw, bh;
+ const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bwl = mi_width_log2(mi->mbmi.sb_type);
- bhl = mi_height_log2(mi->mbmi.sb_type);
+ bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
- if (bwl == bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+ if (bw == bs && bh == bs) {
+ reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, bs, bs, mi_row,
mi_col, bsize);
- } else if (bwl == bsl && bhl < bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
+ } else if (bw == bs && bh < bs) {
+ reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, bs, hbs, mi_row, mi_col,
bsize);
- reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
- mi_row + bs, mi_col, bsize);
- } else if (bwl < bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
+ reset_skip_txfm_size_b(cpi, mi + hbs * mis, mis, max_tx_size, bs, hbs,
+ mi_row + hbs, mi_col, bsize);
+ } else if (bw < bs && bh == bs) {
+ reset_skip_txfm_size_b(cpi, mi, mis, max_tx_size, hbs, bs, mi_row, mi_col,
bsize);
- reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
- mi_col + bs, bsize);
+ reset_skip_txfm_size_b(cpi, mi + hbs, mis, max_tx_size, hbs, bs, mi_row,
+ mi_col + hbs, bsize);
} else {
- BLOCK_SIZE_TYPE subsize;
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
- assert(bwl < bsl && bhl < bsl);
- if (bsize == BLOCK_64X64) {
- subsize = BLOCK_32X32;
- } else if (bsize == BLOCK_32X32) {
- subsize = BLOCK_16X16;
- } else {
- assert(bsize == BLOCK_16X16);
- subsize = BLOCK_8X8;
- }
+ assert(bw < bs && bh < bs);
for (n = 0; n < 4; n++) {
- const int y_idx = n >> 1, x_idx = n & 0x01;
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
- reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
- mi_row + y_idx * bs, mi_col + x_idx * bs,
- subsize);
+ reset_skip_txfm_size_sb(cpi, &mi[mi_dr * mis + mi_dc], max_tx_size,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
}
}
}
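
The split case now derives its four quadrant offsets arithmetically:
mi_dc = hbs * (n & 1) walks the columns, mi_dr = hbs * (n >> 1) the rows.
A quick standalone check of the pattern:

#include <stdio.h>

int main(void) {
  const int hbs = 2;  // e.g. half of a 32x32 block, in 8x8 mode-info units
  int n;
  for (n = 0; n < 4; n++)
    printf("n=%d -> row +%d, col +%d\n", n, hbs * (n >> 1), hbs * (n & 1));
  return 0;  // prints (0,0), (0,2), (2,0), (2,2): the four quadrants
}
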
@@ -2524,7 +2507,7 @@
static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) {
const MB_PREDICTION_MODE y_mode = mi->mbmi.mode;
const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
++cpi->y_uv_mode_count[y_mode][uv_mode];
@@ -2562,7 +2545,7 @@
}
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD * const xd = &x->e_mbd;
@@ -2665,7 +2648,7 @@
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx);
+ update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
} else {
int x, y;
TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode;
@@ -2678,18 +2661,15 @@
if (sz == TX_8X8 && bsize < BLOCK_8X8)
sz = TX_4X4;
} else if (bsize >= BLOCK_8X8) {
- sz = mbmi->txfm_size;
+ sz = mbmi->tx_size;
} else {
sz = TX_4X4;
}
- for (y = 0; y < mi_height; y++) {
- for (x = 0; x < mi_width; x++) {
- if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
- mi[mis * y + x].mbmi.txfm_size = sz;
- }
- }
- }
+ for (y = 0; y < mi_height; y++)
+ for (x = 0; x < mi_width; x++)
+ if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
+ mi[mis * y + x].mbmi.tx_size = sz;
}
}
}
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index 7393974..588b774 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -20,8 +20,8 @@
x->skip_encode = 0;
mbmi->mode = DC_PRED;
mbmi->ref_frame[0] = INTRA_FRAME;
- mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
- : TX_8X8)
+ mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
+ : TX_8X8)
: TX_4X4;
vp9_encode_intra_block_y(x, mbmi->sb_type);
return vp9_get_mb_ss(x->plane[0].src_diff);
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index 496f421..e217924 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -14,7 +14,7 @@
#include "vp9/encoder/vp9_onyx_int.h"
int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg);
#endif // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 13aee3e..da9a3bd 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -69,7 +69,7 @@
vp9_short_idct16x16_add(dqcoeff, dest, stride);
}
-static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -81,18 +81,18 @@
pd->dst.buf, pd->dst.stride);
}
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
subtract_plane(x, bsize, 0);
}
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
int i;
for (i = 1; i < MAX_MB_PLANE; i++)
subtract_plane(x, bsize, i);
}
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
vp9_subtract_sby(x, bsize);
vp9_subtract_sbuv(x, bsize);
}
@@ -142,7 +142,7 @@
}
static void optimize_b(MACROBLOCK *mb,
- int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+ int plane, int block, BLOCK_SIZE plane_bsize,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
@@ -370,7 +370,7 @@
*a = *l = (final_eob > 0);
}
-void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
int x, y;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
@@ -378,15 +378,15 @@
&ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
}
-static void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize,
+static void optimize_init_b(int plane, BLOCK_SIZE bsize,
struct encode_b_args *args) {
const MACROBLOCKD *xd = &args->x->e_mbd;
const struct macroblockd_plane* const pd = &xd->plane[plane];
- const BLOCK_SIZE_TYPE plane_bsize = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
- const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size;
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
int i;
switch (tx_size) {
@@ -419,7 +419,7 @@
}
}
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK* const x = args->x;
@@ -492,7 +492,7 @@
}
}
-static void encode_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
@@ -536,7 +536,7 @@
}
}
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
@@ -548,7 +548,7 @@
foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
}
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
@@ -564,7 +564,7 @@
foreach_transformed_block(xd, bsize, encode_block, &arg);
}
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
@@ -714,7 +714,7 @@
}
}
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
@@ -722,7 +722,7 @@
foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
&arg);
}
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD* const xd = &x->e_mbd;
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 1db15c3..2aa4188 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -32,18 +32,18 @@
struct optimize_ctx *ctx;
};
-void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_xform_quant(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg);
-void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
#endif // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 6b37cc9..92485f9 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -661,7 +661,7 @@
mv.as_mv.col <<= 3;
this_error = motion_error;
vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ xd->mode_info_context->mbmi.tx_size = TX_4X4;
xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
xd->mode_info_context->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1,
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 5b23653..a5dfaed 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -16,7 +16,7 @@
void vp9_init_mode_costs(VP9_COMP *c) {
- VP9_COMMON *x = &c->common;
+ VP9_COMMON *const cm = &c->common;
const vp9_tree_p KT = vp9_intra_mode_tree;
int i, j;
@@ -28,16 +28,16 @@
}
// TODO(rbultje) separate tables for superblock costing?
- vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1],
+ vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
vp9_intra_mode_tree);
vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- x->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+ cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- x->fc.switchable_interp_prob[i],
+ cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
}
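
vp9_cost_tokens converts the frame context's node probabilities into
per-mode bit costs by walking the intra-mode tree. The principle, sketched
with libm in place of VP9's precomputed table (the fixed-point scale here is
an assumption):

#include <math.h>

static int cost_bit(int prob, int bit) {  // prob in [1, 255] of branch 0
  const double p = (bit ? 256 - prob : prob) / 256.0;
  return (int)(-256.0 * log2(p) + 0.5);  // cost in 1/256-bit units
}
// A symbol's total cost is the sum of cost_bit() along its path from the
// tree root; that sum is what the tables filled in above cache per mode.
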
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e15c44d..e9c214f 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -608,7 +608,7 @@
sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500;
sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500;
sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500;
- sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500;
+ sf->thresh_mult[THR_D207_PRED] += speed_multiplier * 2500;
sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500;
if (cpi->sf.skip_lots_of_modes) {
@@ -741,6 +741,7 @@
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
sf->use_fast_lpf_pick = 0;
+ sf->use_fast_coef_updates = 0;
sf->using_small_partition_info = 0;
// Skip any mode not chosen at size < X for all sizes > X
// Hence BLOCK_64X64 (skip is off)
@@ -802,6 +803,7 @@
sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
+ sf->use_fast_coef_updates = 1;
}
if (speed == 2) {
sf->adjust_thresholds_by_speed = 1;
@@ -840,6 +842,7 @@
sf->auto_min_max_partition_interval = 2;
sf->disable_split_var_thresh = 32;
sf->disable_filter_search_var_thresh = 32;
+ sf->use_fast_coef_updates = 2;
}
if (speed == 3) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -866,6 +869,7 @@
sf->disable_filter_search_var_thresh = 64;
sf->intra_y_mode_mask = INTRA_DC_ONLY;
sf->intra_uv_mode_mask = INTRA_DC_ONLY;
+ sf->use_fast_coef_updates = 2;
}
if (speed == 4) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -894,6 +898,7 @@
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
sf->disable_filter_search_var_thresh = 96;
+ sf->use_fast_coef_updates = 2;
}
/*
if (speed == 2) {
@@ -1720,6 +1725,10 @@
vp9_zero(cpi->y_uv_mode_count)
+#ifdef MODE_TEST_HIT_STATS
+ vp9_zero(cpi->mode_test_hits)
+#endif
+
return (VP9_PTR) cpi;
}
@@ -1801,6 +1810,34 @@
#endif
+#ifdef MODE_TEST_HIT_STATS
+ if (cpi->pass != 1) {
+ double norm_per_pixel_mode_tests = 0;
+ double norm_counts[BLOCK_SIZES];
+ int i;
+ int sb64_per_frame;
+ int norm_factors[BLOCK_SIZES] =
+ {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1};
+ FILE *f = fopen("mode_hit_stats.stt", "a");
+
+ // On average, how many mode tests do we do?
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ norm_counts[i] = (double)cpi->mode_test_hits[i] /
+ (double)norm_factors[i];
+ norm_per_pixel_mode_tests += norm_counts[i];
+ }
+ // Convert to a number per 64x64 and per frame
+ sb64_per_frame = ((cpi->common.height + 63) / 64) *
+ ((cpi->common.width + 63) / 64);
+ norm_per_pixel_mode_tests =
+ norm_per_pixel_mode_tests /
+ (double)(cpi->common.current_video_frame * sb64_per_frame);
+
+ fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests);
+ fclose(f);
+ }
+#endif
+
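
The normalization converts raw per-size hit counts into mode tests per 64x64
superblock per frame: each counter is divided by how many blocks of its size
fit into a superblock, the results are summed, and the sum is divided by
frames times superblocks. A quick check with made-up numbers:

#include <stdio.h>

int main(void) {
  // Hypothetical 1280x720 encode over 300 frames.
  const int width = 1280, height = 720, frames = 300;
  const int sb64_per_frame = ((height + 63) / 64) * ((width + 63) / 64);
  const double total_norm_hits = 1440000.0;  // assumed accumulated sum
  // 20 * 12 = 240 superblocks; 1440000 / (300 * 240) = 20 tests/SB/frame.
  printf("%6.4f\n", total_norm_hits / (double)(frames * sb64_per_frame));
  return 0;
}
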
#ifdef ENTROPY_STATS
{
int i, j, k;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 8328374..252e982 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -36,6 +36,8 @@
#define DISABLE_RC_LONG_TERM_MEM 0
#endif
+// #define MODE_TEST_HIT_STATS
+
// #define SPEEDSTATS 1
#if CONFIG_MULTIPLE_ARF
// Set MIN_GF_INTERVAL to 1 for the full decomposition.
@@ -182,7 +184,7 @@
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
- THR_D27_PRED,
+ THR_D207_PRED,
THR_D153_PRED,
THR_D63_PRED,
THR_D117_PRED,
@@ -273,12 +275,12 @@
int use_square_partition_only;
int unused_mode_skip_lvl;
int reference_masking;
- BLOCK_SIZE_TYPE always_this_block_size;
+ BLOCK_SIZE always_this_block_size;
int auto_min_max_partition_size;
int auto_min_max_partition_interval;
int auto_min_max_partition_count;
- BLOCK_SIZE_TYPE min_partition_size;
- BLOCK_SIZE_TYPE max_partition_size;
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
int adjust_partitioning_from_last_frame;
int last_partitioning_redo_frequency;
int disable_splitmv;
@@ -298,6 +300,7 @@
int use_rd_breakout;
int use_uv_intra_rd_estimate;
int use_fast_lpf_pick;
+ int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
} SPEED_FEATURES;
typedef struct VP9_COMP {
@@ -663,6 +666,12 @@
#ifdef ENTROPY_STATS
int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2];
#endif
+
+
+#ifdef MODE_TEST_HIT_STATS
+ // Debug / test stats
+ int64_t mode_test_hits[BLOCK_SIZES];
+#endif
} VP9_COMP;
static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 02c0685..fb0e470 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -84,7 +84,6 @@
*eob_ptr = eob + 1;
}
-// This function works well for large transform size.
void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
int16_t *zbin_ptr, int16_t *round_ptr,
@@ -105,8 +104,8 @@
eob = -1;
// Base ZBIN
- zbins[0] = zbin_ptr[0] + zbin_oq_value;
- zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1);
+ zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1);
nzbins[0] = zbins[0] * -1;
nzbins[1] = zbins[1] * -1;
@@ -114,7 +113,7 @@
// Pre-scan pass
for (i = 0; i < n_coeffs; i++) {
rc = scan[i];
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
// If the coefficient is out of the base ZBIN range, keep it for
// quantization.
@@ -130,14 +129,14 @@
// Calculate ZBIN
zbin = (zbins[rc != 0]);
- z = coeff_ptr[rc] * 2;
+ z = coeff_ptr[rc];
sz = (z >> 31); // sign of z
x = (z ^ sz) - sz; // x = abs(z)
if (x >= zbin) {
- x += (round_ptr[rc != 0]);
+ x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
- quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
+ quant_shift_ptr[rc != 0]) >> 15; // quantize (x)
x = (y ^ sz) - sz; // get the sign back
qcoeff_ptr[rc] = x; // write to destination
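
The rescaling removes the old `z = coeff_ptr[rc] * 2` pre-scaling: the
zero-bin and rounding terms are halved up front with ROUND_POWER_OF_TWO, and
the final shift drops from 16 to 15 so qcoeff keeps its scale. One
coefficient through the new path, as a scalar sketch (the macro matches the
vpx_ports definition; driver values are up to the caller):

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static int quantize_one_32x32(int coeff, int zbin, int round,
                              int quant, int quant_shift) {
  const int z = coeff;                 // no "* 2" pre-scaling any more
  const int sz = z >> 31;              // 0 for positive, -1 for negative
  int x = (z ^ sz) - sz;               // abs(z)
  int y = 0;
  if (x >= ROUND_POWER_OF_TWO(zbin, 1)) {
    x += ROUND_POWER_OF_TWO(round, 1);
    y = ((((x * quant) >> 16) + x) * quant_shift) >> 15;  // was >> 16
  }
  return (y ^ sz) - sz;                // reapply the sign
}
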
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ee21957..39b6544 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -90,7 +90,7 @@
{H_PRED, INTRA_FRAME, NONE},
{V_PRED, INTRA_FRAME, NONE},
{D135_PRED, INTRA_FRAME, NONE},
- {D27_PRED, INTRA_FRAME, NONE},
+ {D207_PRED, INTRA_FRAME, NONE},
{D153_PRED, INTRA_FRAME, NONE},
{D63_PRED, INTRA_FRAME, NONE},
{D117_PRED, INTRA_FRAME, NONE},
@@ -364,7 +364,7 @@
vp9_clear_system_state();
}
-static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum) {
// Note our transform coeffs are 8 times an orthogonal transform.
@@ -375,7 +375,7 @@
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
- const BLOCK_SIZE_TYPE bs = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
unsigned int sse;
int rate;
int64_t dist;
@@ -393,55 +393,52 @@
*out_dist_sum = dist_sum << 4;
}
-static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
TX_SIZE tx_size,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
int *out_skip) {
- int t = 4, j, k;
- BLOCK_SIZE_TYPE bs = BLOCK_4X4;
+ int j, k;
+ BLOCK_SIZE bs;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
- const int width = plane_block_width(bsize, pd);
- const int height = plane_block_height(bsize, pd);
+ const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int height = 4 * num_4x4_blocks_high_lookup[bsize];
int rate_sum = 0;
int64_t dist_sum = 0;
+ const int t = 4 << tx_size;
if (tx_size == TX_4X4) {
bs = BLOCK_4X4;
- t = 4;
} else if (tx_size == TX_8X8) {
bs = BLOCK_8X8;
- t = 8;
} else if (tx_size == TX_16X16) {
bs = BLOCK_16X16;
- t = 16;
} else if (tx_size == TX_32X32) {
bs = BLOCK_32X32;
- t = 32;
} else {
assert(0);
}
+
*out_skip = 1;
for (j = 0; j < height; j += t) {
for (k = 0; k < width; k += t) {
int rate;
int64_t dist;
unsigned int sse;
- (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k,
- p->src.stride,
- pd->dst.buf + j * pd->dst.stride + k,
- pd->dst.stride, &sse);
+ cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
+ &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
+ &sse);
// sse works better than var, since there is no dc prediction used
- model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
- &rate, &dist);
+ model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
*out_skip &= (rate < 1024);
}
}
+
*out_rate_sum = rate_sum;
- *out_dist_sum = (dist_sum << 4);
+ *out_dist_sum = dist_sum << 4;
}
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
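
After the change the if/else ladder only maps tx_size to a BLOCK_SIZE; the
per-size pixel count collapses into `t = 4 << tx_size`. A sanity check
against the old ladder's constants:

#include <assert.h>

int main(void) {
  const int expected[4] = { 4, 8, 16, 32 };  // TX_4X4 .. TX_32X32
  int tx_size;
  for (tx_size = 0; tx_size < 4; tx_size++)
    assert((4 << tx_size) == expected[tx_size]);
  return 0;
}
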
@@ -492,7 +489,7 @@
int c, cost;
// Check for consistency of tx_size with mode info
- assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->txfm_size == tx_size
+ assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi) == tx_size);
if (eob == 0) {
@@ -579,7 +576,7 @@
}
}
-static void rate_block(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct rdcost_block_args* args = arg;
@@ -592,7 +589,7 @@
args->scan, args->nb);
}
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
@@ -626,10 +623,10 @@
int *rate, int64_t *distortion,
int *skippable, int64_t *sse,
int64_t ref_best_rd, int plane,
- BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE_TYPE bs = get_plane_block_size(bsize, pd);
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
int i;
@@ -637,7 +634,7 @@
num_4x4_blocks_wide, num_4x4_blocks_high,
0, 0, 0, ref_best_rd, 0 };
if (plane == 0)
- xd->mode_info_context->mbmi.txfm_size = tx_size;
+ xd->mode_info_context->mbmi.tx_size = tx_size;
switch (tx_size) {
case TX_4X4:
@@ -687,7 +684,7 @@
int *rate, int64_t *distortion,
int *skip, int64_t *sse,
int64_t ref_best_rd,
- BLOCK_SIZE_TYPE bs) {
+ BLOCK_SIZE bs) {
const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -695,20 +692,20 @@
if (max_txfm_size == TX_32X32 &&
(cm->tx_mode == ALLOW_32X32 ||
cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_txfm_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
cm->tx_mode == TX_MODE_SELECT)) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode != ONLY_4X4) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
txfm_rd_in_plane(x, rate, distortion, skip,
- &sse[mbmi->txfm_size], ref_best_rd, 0, bs,
- mbmi->txfm_size);
+ &sse[mbmi->tx_size], ref_best_rd, 0, bs,
+ mbmi->tx_size);
cpi->txfm_stepdown_count[0]++;
}
@@ -717,7 +714,7 @@
int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t tx_cache[TX_MODES],
- BLOCK_SIZE_TYPE bs) {
+ BLOCK_SIZE bs) {
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -763,26 +760,26 @@
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
rd[TX_32X32][1] < rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_tx_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_16X16][1] < rd[TX_8X8][1] &&
rd[TX_16X16][1] < rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode == ALLOW_8X8 ||
cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
- *distortion = d[mbmi->txfm_size];
- *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT];
- *skip = s[mbmi->txfm_size];
+ *distortion = d[mbmi->tx_size];
+ *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+ *skip = s[mbmi->tx_size];
tx_cache[ONLY_4X4] = rd[TX_4X4][0];
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
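
The cascade selects the largest transform permitted by tx_mode whose
TX_MODE_SELECT cost strictly beats every smaller one. Stripped of the
tx_mode short-circuits, the core comparison is (a sketch, not the patch's
code):

#include <stdint.h>

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

static TX_SIZE pick_tx_size(TX_SIZE max_tx_size, const int64_t rd[4][2]) {
  if (max_tx_size == TX_32X32 && rd[TX_32X32][1] < rd[TX_16X16][1] &&
      rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1])
    return TX_32X32;
  if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] &&
      rd[TX_16X16][1] < rd[TX_4X4][1])
    return TX_16X16;
  return rd[TX_8X8][1] < rd[TX_4X4][1] ? TX_8X8 : TX_4X4;
}
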
@@ -820,7 +817,7 @@
int64_t *d, int64_t *distortion,
int *s, int *skip, int64_t *sse,
int64_t ref_best_rd,
- BLOCK_SIZE_TYPE bs) {
+ BLOCK_SIZE bs) {
const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -870,28 +867,28 @@
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
rd[TX_32X32][1] <= rd[TX_8X8][1] &&
rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_32X32;
+ mbmi->tx_size = TX_32X32;
} else if (max_txfm_size >= TX_16X16 &&
(cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_16X16][1] <= rd[TX_8X8][1] &&
rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
- mbmi->txfm_size = TX_16X16;
+ mbmi->tx_size = TX_16X16;
} else if (cm->tx_mode == ALLOW_8X8 ||
cm->tx_mode == ALLOW_16X16 ||
cm->tx_mode == ALLOW_32X32 ||
(cm->tx_mode == TX_MODE_SELECT &&
rd[TX_8X8][1] <= rd[TX_4X4][1])) {
- mbmi->txfm_size = TX_8X8;
+ mbmi->tx_size = TX_8X8;
} else {
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
}
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
- txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->txfm_size],
- ref_best_rd, 0, bs, mbmi->txfm_size);
+ txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
+ ref_best_rd, 0, bs, mbmi->tx_size);
if (max_txfm_size == TX_32X32 &&
rd[TX_32X32][1] <= rd[TX_16X16][1] &&
@@ -911,7 +908,7 @@
static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCK *x, int *rate, int64_t *distortion,
- int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
+ int *skip, int64_t *psse, BLOCK_SIZE bs,
int64_t txfm_cache[TX_MODES],
int64_t ref_best_rd) {
int r[TX_SIZES][2], s[TX_SIZES];
@@ -930,7 +927,7 @@
choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
ref_best_rd, bs);
if (psse)
- *psse = sse[mbmi->txfm_size];
+ *psse = sse[mbmi->tx_size];
return;
}
@@ -966,7 +963,7 @@
skip, txfm_cache, bs);
}
if (psse)
- *psse = sse[mbmi->txfm_size];
+ *psse = sse[mbmi->tx_size];
}
static int conditional_skipintra(MB_PREDICTION_MODE mode,
@@ -979,7 +976,7 @@
best_intra_mode != V_PRED &&
best_intra_mode != D45_PRED)
return 1;
- if (mode == D27_PRED &&
+ if (mode == D207_PRED &&
best_intra_mode != H_PRED &&
best_intra_mode != D45_PRED)
return 1;
@@ -996,8 +993,7 @@
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int *bestrate, int *bestratey,
int64_t *bestdistortion,
- BLOCK_SIZE_TYPE bsize,
- int64_t rd_thresh) {
+ BLOCK_SIZE bsize, int64_t rd_thresh) {
MB_PREDICTION_MODE mode;
MACROBLOCKD *xd = &x->e_mbd;
int64_t best_rd = rd_thresh;
@@ -1025,7 +1021,7 @@
vpx_memcpy(ta, a, sizeof(ta));
vpx_memcpy(tl, l, sizeof(tl));
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ xd->mode_info_context->mbmi.tx_size = TX_4X4;
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
@@ -1131,7 +1127,7 @@
int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
- BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mode_info_context->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -1197,7 +1193,7 @@
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int64_t tx_cache[TX_MODES],
int64_t best_rd) {
MB_PREDICTION_MODE mode;
@@ -1243,7 +1239,7 @@
if (this_rd < best_rd) {
mode_selected = mode;
best_rd = this_rd;
- best_tx = mic->mbmi.txfm_size;
+ best_tx = mic->mbmi.tx_size;
*rate = this_rate;
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
@@ -1251,7 +1247,7 @@
}
if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
- for (i = 0; i < TX_MODES; i++) {
+ for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
const int64_t adj_rd = this_rd + local_tx_cache[i] -
local_tx_cache[cpi->common.tx_mode];
if (adj_rd < tx_cache[i]) {
@@ -1262,14 +1258,14 @@
}
mic->mbmi.mode = mode_selected;
- mic->mbmi.txfm_size = best_tx;
+ mic->mbmi.tx_size = best_tx;
return best_rd;
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skippable,
- int64_t *sse, BLOCK_SIZE_TYPE bsize,
+ int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -1312,7 +1308,7 @@
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE mode_selected = DC_PRED;
int64_t best_rd = INT64_MAX, this_rd;
@@ -1354,7 +1350,7 @@
static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
int64_t this_rd;
int64_t this_sse;
@@ -1368,7 +1364,7 @@
return this_rd;
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
int *rate_uv, int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
MB_PREDICTION_MODE *mode_uv) {
@@ -1410,13 +1406,13 @@
}
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int_mv *frame_mv,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv);
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv);
@@ -1504,7 +1500,7 @@
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
MODE_INFO *const mi = xd->mode_info_context;
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
const int width = plane_block_width(bsize, pd);
const int height = plane_block_height(bsize, pd);
int idx, idy;
@@ -1515,28 +1511,20 @@
int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
x->plane[0].src_diff);
int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
- uint8_t* const pre = raster_block_offset_uint8(BLOCK_8X8, i,
- pd->pre[0].buf,
- pd->pre[0].stride);
uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
pd->dst.buf, pd->dst.stride);
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0;
+ int ref, second_ref = has_second_ref(&mi->mbmi);
- vp9_build_inter_predictor(pre, pd->pre[0].stride,
- dst, pd->dst.stride,
- &mi->bmi[i].as_mv[0].as_mv,
- &xd->scale_factor[0],
- width, height, 0, &xd->subpix, MV_PRECISION_Q3);
-
- if (mi->mbmi.ref_frame[1] > 0) {
- uint8_t* const second_pre =
- raster_block_offset_uint8(BLOCK_8X8, 0, pd->pre[1].buf, pd->pre[1].stride);
- vp9_build_inter_predictor(second_pre, pd->pre[1].stride,
+ for (ref = 0; ref < 1 + second_ref; ++ref) {
+ const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
+ pd->pre[ref].buf, pd->pre[ref].stride);
+ vp9_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
- &mi->bmi[i].as_mv[1].as_mv,
- &xd->scale_factor[1],
- width, height, 1, &xd->subpix, MV_PRECISION_Q3);
+ &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->scale_factor[ref],
+ width, height, ref, &xd->subpix, MV_PRECISION_Q3);
}
vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
@@ -1647,7 +1635,7 @@
int64_t this_segment_rd = 0;
int label_mv_thresh;
int segmentyrate = 0;
- BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
vp9_variance_fn_ptr_t *v_fn_ptr;
@@ -2072,7 +2060,7 @@
static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
- int ref_frame, BLOCK_SIZE_TYPE block_size ) {
+ int ref_frame, BLOCK_SIZE block_size) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int_mv this_mv;
@@ -2241,7 +2229,7 @@
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
int idx, MV_REFERENCE_FRAME frame_type,
- BLOCK_SIZE_TYPE block_size,
+ BLOCK_SIZE block_size,
int mi_row, int mi_col,
int_mv frame_nearest_mv[MAX_REF_FRAMES],
int_mv frame_near_mv[MAX_REF_FRAMES],
@@ -2304,7 +2292,7 @@
}
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
@@ -2317,7 +2305,7 @@
int_mv mvp_full;
int ref = mbmi->ref_frame[0];
int_mv ref_mv = mbmi->ref_mvs[ref][0];
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -2430,7 +2418,7 @@
}
static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int_mv *frame_mv,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
@@ -2441,7 +2429,7 @@
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv ref_mv[2];
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int ite;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
@@ -2587,7 +2575,7 @@
}
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
int64_t txfm_cache[],
int *rate2, int64_t *distortion,
int *skippable,
@@ -2871,9 +2859,8 @@
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
x->skip = 1;
else if (x->encode_breakout) {
- const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]);
- const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize,
- &xd->plane[1]);
+ const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var, sse;
// Skipping threshold for ac.
unsigned int thresh_ac;
@@ -2999,7 +2986,7 @@
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int64_t *returndist,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -3051,14 +3038,14 @@
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
- BLOCK_SIZE_TYPE bsize,
+ BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const struct segmentation *seg = &cm->seg;
- const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]);
+ const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
MB_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
@@ -3105,14 +3092,11 @@
int_mv seg_mvs[4][MAX_REF_FRAMES];
union b_mode_info best_bmodes[4];
PARTITION_INFO best_partition;
- int bwsl = b_width_log2(bsize);
- int bws = (1 << bwsl) / 4; // mode_info step for subsize
- int bhsl = b_height_log2(bsize);
- int bhs = (1 << bhsl) / 4; // mode_info step for subsize
+ const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
- x->skip_encode = (cpi->sf.skip_encode_frame &&
- xd->q_index < QIDX_SKIP_THRESH);
+ x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
for (i = 0; i < 4; i++) {
int j;
@@ -3357,6 +3341,12 @@
continue;
}
+#ifdef MODE_TEST_HIT_STATS
+ // TEST/DEBUG CODE
+ // Keep a record of the number of test hits at each size
+ cpi->mode_test_hits[bsize]++;
+#endif
+
if (this_mode == I4X4_PRED) {
int rate;
@@ -3367,7 +3357,7 @@
*/
// I4X4_PRED is only considered for block sizes less than 8x8.
- mbmi->txfm_size = TX_4X4;
+ mbmi->tx_size = TX_4X4;
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
&distortion_y, best_rd) >= best_rd)
continue;
@@ -3418,7 +3408,7 @@
if (rate_y == INT_MAX)
continue;
- uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]);
+ uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx],
@@ -3468,7 +3458,7 @@
cpi->rd_threshes[bsize][THR_NEWA];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
- xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+ xd->mode_info_context->mbmi.tx_size = TX_4X4;
cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
if (cm->mcomp_filter_type != BILINEAR) {
@@ -3832,7 +3822,7 @@
tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
}
if (!mode_excluded && this_rd != INT64_MAX) {
- for (i = 0; i < TX_MODES; i++) {
+ for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
int64_t adj_rd = INT64_MAX;
if (this_mode != I4X4_PRED) {
adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index b7f56e5..eba7df9 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -20,12 +20,12 @@
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
void vp9_init_me_luts();
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 8ff608b..0a6d2ab 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -138,7 +138,7 @@
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
- const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
@@ -161,52 +161,45 @@
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bwl, bhl;
- const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+ int bw, bh;
+ const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bwl = mi_width_log2(mi->mbmi.sb_type);
- bhl = mi_height_log2(mi->mbmi.sb_type);
+ bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
- if (bwl == bsl && bhl == bsl) {
+ if (bw == bs && bh == bs) {
count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col);
- } else if (bwl == bsl && bhl < bsl) {
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ } else if (bw == bs && bh < bs) {
count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col);
- count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col);
- } else if (bwl < bsl && bhl == bsl) {
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cpi, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col);
+ } else if (bw < bs && bh == bs) {
count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col);
- count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs);
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cpi, mi + hbs, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
} else {
- BLOCK_SIZE_TYPE subsize;
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
- assert(bwl < bsl && bhl < bsl);
- if (bsize == BLOCK_64X64) {
- subsize = BLOCK_32X32;
- } else if (bsize == BLOCK_32X32) {
- subsize = BLOCK_16X16;
- } else {
- assert(bsize == BLOCK_16X16);
- subsize = BLOCK_8X8;
- }
+ assert(bw < bs && bh < bs);
for (n = 0; n < 4; n++) {
- const int y_idx = n >> 1, x_idx = n & 0x01;
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
+ count_segs_sb(cpi, &mi[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
- mi_row + y_idx * bs, mi_col + x_idx * bs, subsize);
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
}
}
}
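
subsize_lookup[PARTITION_SPLIT][bsize] replaces the removed if/else chain:
each square size maps to its quarter. A stand-in for square sizes only (the
enum and table here are illustrative, not VP9's):

typedef enum { B8X8, B16X16, B32X32, B64X64 } SQUARE_BS;

static SQUARE_BS split_subsize(SQUARE_BS bs) {
  // B16X16 -> B8X8, B32X32 -> B16X16, B64X64 -> B32X32, as in the chain.
  static const SQUARE_BS sub[4] = { B8X8, B8X8, B16X16, B32X32 };
  return sub[bs];
}
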
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index b4270d5..03bf147 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -97,8 +97,7 @@
TX_SIZE tx_size;
};
-static void set_entropy_context_b(int plane, int block,
- BLOCK_SIZE_TYPE plane_bsize,
+static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
MACROBLOCKD *const xd = args->xd;
@@ -108,7 +107,7 @@
set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff);
}
-static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE plane_bsize,
+static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
VP9_COMP *cpi = args->cpi;
@@ -122,16 +121,16 @@
const int eob = pd->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
- int seg_eob;
+
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- ENTROPY_CONTEXT above_ec, left_ec;
uint8_t token_cache[1024];
const uint8_t *band_translate;
ENTROPY_CONTEXT *A, *L;
+ const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -140,45 +139,9 @@
assert((!type && !plane) || (type && plane));
- switch (tx_size) {
- case TX_4X4:
- above_ec = A[0] != 0;
- left_ec = L[0] != 0;
- seg_eob = 16;
- scan = get_scan_4x4(get_tx_type_4x4(type, xd, block));
- band_translate = vp9_coefband_trans_4x4;
- break;
- case TX_8X8:
- above_ec = !!*(uint16_t *)A;
- left_ec = !!*(uint16_t *)L;
- seg_eob = 64;
- scan = get_scan_8x8(get_tx_type_8x8(type, xd));
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_16X16:
- above_ec = !!*(uint32_t *)A;
- left_ec = !!*(uint32_t *)L;
- seg_eob = 256;
- scan = get_scan_16x16(get_tx_type_16x16(type, xd));
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- case TX_32X32:
- above_ec = !!*(uint64_t *)A;
- left_ec = !!*(uint64_t *)L;
- seg_eob = 1024;
- scan = vp9_default_scan_32x32;
- band_translate = vp9_coefband_trans_8x8plus;
- break;
- default:
- assert(!"Invalid transform size");
- }
-
- pt = combine_entropy_contexts(above_ec, left_ec);
+ pt = get_entropy_context(xd, tx_size, type, block, A, L,
+ &scan, &band_translate);
nb = vp9_get_coef_neighbors_handle(scan);
-
- if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP))
- seg_eob = 0;
-
c = 0;
do {
const int band = get_coef_band(band_translate, c);
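
The per-size switch folds into two helpers: the above/left context is the OR
of the 1, 2, 4, or 8 context entries the transform covers (the old
uint16_t/uint32_t/uint64_t casts), and seg_eob is the coefficient count for
the size, forced to zero for SEG_LVL_SKIP segments. A hedged reconstruction
of the eob half (the real get_tx_eob lives in common code; this signature is
an assumption):

static int tx_eob(int seg_skip_active, int tx_size) {  // TX_4X4 == 0 ...
  // 16 << (tx_size << 1) yields 16, 64, 256, 1024 coefficients.
  return seg_skip_active ? 0 : 16 << (tx_size << 1);
}
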
@@ -222,20 +185,20 @@
};
static void is_skippable(int plane, int block,
- BLOCK_SIZE_TYPE plane_bsize, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *argv) {
struct is_skippable_args *args = argv;
args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
}
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
int result = 1;
struct is_skippable_args args = {xd, &result};
foreach_transformed_block(xd, bsize, is_skippable, &args);
return result;
}
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane) {
int result = 1;
struct is_skippable_args args = {xd, &result};
@@ -244,7 +207,7 @@
}
void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE_TYPE bsize) {
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -252,7 +215,7 @@
const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
if (mbmi->skip_coeff) {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 2afb748..b78e100 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -31,13 +31,13 @@
typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS + 1];
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
+int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane);
struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE_TYPE bsize);
+ BLOCK_SIZE bsize);
#ifdef ENTROPY_STATS
void init_context_counters();
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index ce65859..95ae266 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -403,6 +403,148 @@
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
}
+
+#if !FDCT32x32_HIGH_PRECISION
+ // scale the magnitude down so that the intermediate values stay within
+ // the range of 16 bits.
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
+ step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
+ step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
+ step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
+ step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
+ step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
+ step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
+ step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
+ step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+ step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
+
+ step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif
+
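In the !FDCT32x32_HIGH_PRECISION configuration, pass 1 scales the intermediates down right after stage 2 (rather than after stage 3, as in the block removed below) so they keep fitting in 16 bits. A minimal scalar sketch of the lane-wise operation that the compare/sub/add/shift sequence above performs (the name is illustrative, not part of the patch):

    static INLINE int16_t half_round_shift(int16_t x) {
      const int16_t sign = (int16_t)(x < 0 ? -1 : 0);  /* _mm_cmplt_epi16 */
      x = (int16_t)(x - sign);   /* +1 for negative lanes */
      x = (int16_t)(x + 1);      /* kOne */
      return (int16_t)(x >> 2);  /* _mm_srai_epi16(x, 2) */
    }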
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
// Stage 3
{
step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
@@ -469,144 +611,6 @@
step3[31] = _mm_add_epi16(step2[24], step1[31]);
}
-#if !FDCT32x32_HIGH_PRECISION
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
- step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
- step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
- step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
- step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
- step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
- step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
- step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
- step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
- step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
- step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
- step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
- step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
- step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
- step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
- step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
- step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
- step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
- step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
- step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
- step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
- step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
- step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
- step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
- step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
- step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
- step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
- step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
- step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
- step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
- step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
- step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
- step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
- step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
- step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
- step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
- step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
- step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
- step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
- step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
- step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
- step3[10] = _mm_add_epi16(step3[10], kOne);
- step3[11] = _mm_add_epi16(step3[11], kOne);
- step3[12] = _mm_add_epi16(step3[12], kOne);
- step3[13] = _mm_add_epi16(step3[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step3[16] = _mm_add_epi16(step3[16], kOne);
- step3[17] = _mm_add_epi16(step3[17], kOne);
- step3[18] = _mm_add_epi16(step3[18], kOne);
- step3[19] = _mm_add_epi16(step3[19], kOne);
- step3[20] = _mm_add_epi16(step3[20], kOne);
- step3[21] = _mm_add_epi16(step3[21], kOne);
- step3[22] = _mm_add_epi16(step3[22], kOne);
- step3[23] = _mm_add_epi16(step3[23], kOne);
- step3[24] = _mm_add_epi16(step3[24], kOne);
- step3[25] = _mm_add_epi16(step3[25], kOne);
- step3[26] = _mm_add_epi16(step3[26], kOne);
- step3[27] = _mm_add_epi16(step3[27], kOne);
- step3[28] = _mm_add_epi16(step3[28], kOne);
- step3[29] = _mm_add_epi16(step3[29], kOne);
- step3[30] = _mm_add_epi16(step3[30], kOne);
- step3[31] = _mm_add_epi16(step3[31], kOne);
- step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
- step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
- step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
- step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
- step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
- step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
- step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
- step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
- step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
- step3[10] = _mm_srai_epi16(step3[10], 2);
- step3[11] = _mm_srai_epi16(step3[11], 2);
- step3[12] = _mm_srai_epi16(step3[12], 2);
- step3[13] = _mm_srai_epi16(step3[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step3[16] = _mm_srai_epi16(step3[16], 2);
- step3[17] = _mm_srai_epi16(step3[17], 2);
- step3[18] = _mm_srai_epi16(step3[18], 2);
- step3[19] = _mm_srai_epi16(step3[19], 2);
- step3[20] = _mm_srai_epi16(step3[20], 2);
- step3[21] = _mm_srai_epi16(step3[21], 2);
- step3[22] = _mm_srai_epi16(step3[22], 2);
- step3[23] = _mm_srai_epi16(step3[23], 2);
- step3[24] = _mm_srai_epi16(step3[24], 2);
- step3[25] = _mm_srai_epi16(step3[25], 2);
- step3[26] = _mm_srai_epi16(step3[26], 2);
- step3[27] = _mm_srai_epi16(step3[27], 2);
- step3[28] = _mm_srai_epi16(step3[28], 2);
- step3[29] = _mm_srai_epi16(step3[29], 2);
- step3[30] = _mm_srai_epi16(step3[30], 2);
- step3[31] = _mm_srai_epi16(step3[31], 2);
- }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
- if (pass == 0) {
-#endif
// Stage 4
{
step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
@@ -1158,25 +1162,146 @@
const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
+ // stage 3
+ {
+        // expanding to 32-bit length prior to the addition operations
+ lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
+ lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
+ lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
+ lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
+ lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
+ lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
+ lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
+ lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+
+ lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
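k_cvtlo_epi16/k_cvthi_epi16 are this file's widening helpers: judging from their use here, they sign-extend the low/high four 16-bit lanes to 32 bits so the stage-3 butterflies can run at full precision. An equivalent SSE2 formulation of the low half (a sketch only; the real helpers also take the file's mask16 constant):

    static INLINE __m128i sign_extend_lo_epi16(__m128i v, __m128i kZero) {
      const __m128i sign = _mm_cmplt_epi16(v, kZero);  /* 0xffff where v < 0 */
      return _mm_unpacklo_epi16(v, sign);  /* 32-bit lane: (sign << 16) | v */
    }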
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
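The dct_const_round_shift step is the usual fixed-point correction: the madd products carry DCT_CONST_BITS (14) fractional bits from the cospi constants, which are rounded off. Scalar form (name illustrative):

    static INLINE int32_t dct_const_round_shift_c(int32_t input) {
      return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
    }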
+ {
+ lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
+ lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
+ lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
+ lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
+ lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
+ lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
+ lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
+ lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
+ lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
+ lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
+ lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
+ lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
+ lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
+ lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
+ lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
+ lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+
+ lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
+ lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
+ lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
+ lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
+ lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
+ lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
+ lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
+ lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
+ lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
+ lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
+ lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
+ lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
+ lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
+ lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
+ lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
+ lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+
+ lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+ lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+ }
+
// stage 4
{
        // expanding to 32-bit length prior to the addition operations
- lstep3[ 0] = k_cvtlo_epi16(step3[ 0], mask16, kZero);
- lstep3[ 1] = k_cvthi_epi16(step3[ 0], mask16, kZero);
- lstep3[ 2] = k_cvtlo_epi16(step3[ 1], mask16, kZero);
- lstep3[ 3] = k_cvthi_epi16(step3[ 1], mask16, kZero);
- lstep3[ 4] = k_cvtlo_epi16(step3[ 2], mask16, kZero);
- lstep3[ 5] = k_cvthi_epi16(step3[ 2], mask16, kZero);
- lstep3[ 6] = k_cvtlo_epi16(step3[ 3], mask16, kZero);
- lstep3[ 7] = k_cvthi_epi16(step3[ 3], mask16, kZero);
- lstep3[20] = k_cvtlo_epi16(step3[10], mask16, kZero);
- lstep3[21] = k_cvthi_epi16(step3[10], mask16, kZero);
- lstep3[22] = k_cvtlo_epi16(step3[11], mask16, kZero);
- lstep3[23] = k_cvthi_epi16(step3[11], mask16, kZero);
- lstep3[24] = k_cvtlo_epi16(step3[12], mask16, kZero);
- lstep3[25] = k_cvthi_epi16(step3[12], mask16, kZero);
- lstep3[26] = k_cvtlo_epi16(step3[13], mask16, kZero);
- lstep3[27] = k_cvthi_epi16(step3[13], mask16, kZero);
lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
@@ -1212,88 +1337,150 @@
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- lstep1[10] = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+      // step1[5] = (step3[6] - step3[5]) * cospi_16_64
+      // step1[6] = (step3[6] + step3[5]) * cospi_16_64
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+      // TODO(jingning): manually inline k_madd_epi32 to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
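k_madd_epi32 and k_packs_epi64 are defined earlier in this file. Assuming the plain-SSE2 formulation, k_madd_epi32 forms two 64-bit dot products per call — lanes (0, 1) and (2, 3) of the interleaved data/constant vectors — and k_packs_epi64 later narrows pairs of results back to packed 32-bit lanes. A sketch (note _mm_mul_epu32 is an unsigned multiply; _mm_mul_epi32, the signed form, needs SSE4.1):

    static INLINE __m128i k_madd_epi32_sketch(__m128i a, __m128i b) {
      const __m128i prod02 = _mm_mul_epu32(a, b);                 /* lanes 0, 2 */
      const __m128i prod13 = _mm_mul_epu32(_mm_srli_epi64(a, 32),
                                           _mm_srli_epi64(b, 32)); /* lanes 1, 3 */
      return _mm_add_epi64(prod02, prod13);  /* two 64-bit pair sums */
    }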
{
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- lstep1[36] = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- lstep1[54] = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- lstep1[55] = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- lstep1[56] = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- lstep1[57] = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- lstep1[58] = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- lstep1[59] = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
}
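Numerically this is the same ±(8, 24) rotation as the 16-bit code it replaces, only computed on 32-bit lanes. For one (a, b) pair — e.g. matching halves of step3[18] and step3[29] — the scalar equivalent is:

    static INLINE void rotate8_24(int32_t a, int32_t b,
                                  int32_t *out_a, int32_t *out_b) {
      /* 64-bit products, as in the k_madd_epi32/k_packs_epi64 path */
      const int64_t ta = -(int64_t)a * cospi_8_64 + (int64_t)b * cospi_24_64;
      const int64_t tb =  (int64_t)a * cospi_24_64 + (int64_t)b * cospi_8_64;
      *out_a = (int32_t)((ta + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
      *out_b = (int32_t)((tb + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }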
// stage 5
{
- lstep3[ 8] = k_cvtlo_epi16(step3[4], mask16, kZero);
- lstep3[ 9] = k_cvthi_epi16(step3[4], mask16, kZero);
- lstep3[14] = k_cvtlo_epi16(step3[7], mask16, kZero);
- lstep3[15] = k_cvthi_epi16(step3[7], mask16, kZero);
-
lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
@@ -1465,23 +1652,6 @@
lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
}
{
- lstep3[32] = k_cvtlo_epi16(step3[16], mask16, kZero);
- lstep3[33] = k_cvthi_epi16(step3[16], mask16, kZero);
- lstep3[34] = k_cvtlo_epi16(step3[17], mask16, kZero);
- lstep3[35] = k_cvthi_epi16(step3[17], mask16, kZero);
- lstep3[44] = k_cvtlo_epi16(step3[22], mask16, kZero);
- lstep3[45] = k_cvthi_epi16(step3[22], mask16, kZero);
- lstep3[46] = k_cvtlo_epi16(step3[23], mask16, kZero);
- lstep3[47] = k_cvthi_epi16(step3[23], mask16, kZero);
- lstep3[48] = k_cvtlo_epi16(step3[24], mask16, kZero);
- lstep3[49] = k_cvthi_epi16(step3[24], mask16, kZero);
- lstep3[50] = k_cvtlo_epi16(step3[25], mask16, kZero);
- lstep3[51] = k_cvthi_epi16(step3[25], mask16, kZero);
- lstep3[60] = k_cvtlo_epi16(step3[30], mask16, kZero);
- lstep3[61] = k_cvthi_epi16(step3[30], mask16, kZero);
- lstep3[62] = k_cvtlo_epi16(step3[31], mask16, kZero);
- lstep3[63] = k_cvthi_epi16(step3[31], mask16, kZero);
-
lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 60f7991..7deb981 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -36,6 +36,14 @@
pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
paddw m0, m4 ; m0 = zbin + zbin_oq
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
mova m3, [r2q] ; m3 = dequant
psubw m0, [pw_1]
mov r2, shiftmp
@@ -43,6 +51,9 @@
mova m4, [r2] ; m4 = shift
mov r4, dqcoeffmp
mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
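Taken together, the two b_32x32 hunks replace the old "double |coeff| up front" approach (the paddw m6/m11 pairs removed below) with pre-halved thresholds and a doubled shift, which is equivalent for integer inputs and mirrors the C 32x32 quantize path. In C terms:

    zbin  = (zbin  + 1) >> 1;  /* pcmpeqw/psrlw build the +1; psrlw m0, 1 */
    round = (round + 1) >> 1;  /* likewise for the rounding values */
    shift <<= 1;               /* psllw m4, 1 */
    /* 2 * abs_coeff >= zbin  <=>  abs_coeff >= (zbin + 1) >> 1 */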
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
lea coeffq, [ coeffq+ncoeffq*2]
@@ -56,10 +67,6 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
@@ -112,10 +119,6 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, b_32x32
- paddw m6, m6
- paddw m11, m11
-%endif
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
@@ -164,6 +167,7 @@
pmaxsw m8, m13
add ncoeffq, mmsize
jl .ac_only_loop
+
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
index 19e2feb..533456b 100644
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -270,8 +270,13 @@
%if mmsize == 16
movhps m2, [srcq+src_strideq*2]
%else ; mmsize == 8
+%if %1 == 4
+ movh m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%else
punpckldq m2, [srcq+src_strideq*2]
%endif
+%endif
movh m1, [dstq]
%if mmsize == 16
movlhps m0, m2
@@ -542,9 +547,16 @@
movhps m2, [srcq+src_strideq]
movhps m3, [srcq+src_strideq+1]
%else
+%if %1 == 4
+ movh m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movh m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%else
punpckldq m2, [srcq+src_strideq]
punpckldq m3, [srcq+src_strideq+1]
%endif
+%endif
pavgb m2, m3
%if mmsize == 16
movlhps m0, m2
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d5692ef..fb302ab 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -98,7 +98,9 @@
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
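The two new build entries pick up NEON versions of the DC-only ("_1_add") inverse transforms. These presumably mirror the C shortcut, where a block carrying only a DC coefficient adds one rounded value to every pixel; sketched for 8x8, assuming the helpers from vp9_idct.c and vp9_common.h:

    static void short_idct8x8_1_add_sketch(const int16_t *input, uint8_t *dest,
                                           int dest_stride) {
      int i, j, a1;
      int16_t out = (int16_t)dct_const_round_shift(input[0] * cospi_16_64);
      out = (int16_t)dct_const_round_shift(out * cospi_16_64);
      a1 = ROUND_POWER_OF_TWO(out, 5);  /* final rounding shift for 8x8 */
      for (j = 0; j < 8; ++j) {
        for (i = 0; i < 8; ++i)
          dest[i] = clip_pixel(dest[i] + a1);
        dest += dest_stride;
      }
    }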