Merge "Speed up idct8x8 by rearranging instructions. Speed improves from 264% ~ 270% to 280% ~ 300%, based on assembly-perf."
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index e05d482..9cfa386 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -13,15 +13,17 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
extern "C" {
+#include "./vpx_config.h"
#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
- void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
- void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
}
-#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -36,29 +38,9 @@
}
#endif
-static const double kPi = 3.141592653589793238462643383279502884;
-static void reference2_32x32_idct_2d(double *input, double *output) {
- double x;
- for (int l = 0; l < 32; ++l) {
- for (int k = 0; k < 32; ++k) {
- double s = 0;
- for (int i = 0; i < 32; ++i) {
- for (int j = 0; j < 32; ++j) {
- x = cos(kPi * j * (l + 0.5) / 32.0) *
- cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
- if (i != 0)
- x *= sqrt(2.0);
- if (j != 0)
- x *= sqrt(2.0);
- s += x;
- }
- }
- output[k * 32 + l] = s / 4;
- }
- }
-}
-
-static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+const int kNumCoeffs = 1024;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 32; k++) {
out[k] = 0.0;
@@ -69,7 +51,8 @@
}
}
-static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
+ double output[kNumCoeffs]) {
// First transform columns
for (int i = 0; i < 32; ++i) {
double temp_in[32], temp_out[32];
@@ -91,27 +74,165 @@
}
}
-TEST(VP9Idct32x32Test, AccuracyCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
- for (int i = 0; i < count_test_block; ++i) {
- int16_t in[1024], coeff[1024];
- uint8_t dst[1024], src[1024];
- double out_r[1024];
+typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride);
+typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride);
- for (int j = 0; j < 1024; ++j) {
+class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+ public:
+ virtual ~Trans32x32Test() {}
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ version_ = GET_PARAM(2); // 0: high precision forward transform
+ // 1: low precision version for rd loop
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int version_;
+ fwd_txfm_t fwd_txfm_;
+ inv_txfm_t inv_txfm_;
+};
+
+TEST_P(Trans32x32Test, AccuracyCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
src[j] = rnd.Rand8();
dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
}
+
+ const int pitch = 64;
+ REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+ REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = dst[j] - src[j];
+ const uint32_t error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ total_error += error;
+ }
+ }
+
+ if (version_ == 1) {
+ max_error /= 2;
+ total_error /= 45;
+ }
+
+ EXPECT_GE(1u, max_error)
+ << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
+
+ EXPECT_GE(count_test_block, total_error)
+ << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
+}
+
+TEST_P(Trans32x32Test, CoeffCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+ const int pitch = 64;
+ vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
+ REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+
+ if (version_ == 0) {
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(output_block[j], output_ref_block[j])
+ << "Error: 32x32 FDCT versions have mismatched coefficients";
+ } else {
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+ << "Error: 32x32 FDCT rd has mismatched coefficients";
+ }
+ }
+}
+
+TEST_P(Trans32x32Test, MemCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 2000;
+
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 1024; ++j)
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+ }
+ if (i == 0)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = 255;
+ if (i == 1)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = -255;
+
+ const int pitch = 64;
+ vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
+ REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+
+ // The minimum quant value is 4.
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ if (version_ == 0) {
+ EXPECT_EQ(output_block[j], output_ref_block[j])
+ << "Error: 32x32 FDCT versions have mismatched coefficients";
+ } else {
+ EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
+ << "Error: 32x32 FDCT rd has mismatched coefficients";
+ }
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+ << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+ << "Error: 32x32 FDCT has coefficient larger than "
+ << "4*DCT_MAX_VALUE";
+ }
+ }
+}
+
+TEST_P(Trans32x32Test, InverseAccuracy) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ double out_r[kNumCoeffs];
+
+ // Initialize a test block with input range [-255, 255]
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
in[j] = src[j] - dst[j];
+ }
reference_32x32_dct_2d(in, out_r);
- for (int j = 0; j < 1024; j++)
+ for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
- vp9_short_idct32x32_add_c(coeff, dst, 32);
- for (int j = 0; j < 1024; ++j) {
+ REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+ for (int j = 0; j < kNumCoeffs; ++j) {
const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
@@ -121,72 +242,21 @@
}
}
-TEST(VP9Fdct32x32Test, AccuracyCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- unsigned int max_error = 0;
- int64_t total_error = 0;
- const int count_test_block = 1000;
- for (int i = 0; i < count_test_block; ++i) {
- int16_t test_input_block[1024];
- int16_t test_temp_block[1024];
- uint8_t dst[1024], src[1024];
+using std::tr1::make_tuple;
- for (int j = 0; j < 1024; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- }
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 1024; ++j)
- test_input_block[j] = src[j] - dst[j];
+INSTANTIATE_TEST_CASE_P(
+ C, Trans32x32Test,
+ ::testing::Values(
+ make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0),
+ make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1)));
- const int pitch = 64;
- vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
-
- for (int j = 0; j < 1024; ++j) {
- const unsigned diff = dst[j] - src[j];
- const unsigned error = diff * diff;
- if (max_error < error)
- max_error = error;
- total_error += error;
- }
- }
-
- EXPECT_GE(1u, max_error)
- << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
-
- EXPECT_GE(count_test_block, total_error)
- << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";
-}
-
-TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
- for (int i = 0; i < count_test_block; ++i) {
- int16_t input_block[1024], input_extreme_block[1024];
- int16_t output_block[1024], output_extreme_block[1024];
-
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 1024; ++j) {
- input_block[j] = rnd.Rand8() - rnd.Rand8();
- input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
- }
- if (i == 0)
- for (int j = 0; j < 1024; ++j)
- input_extreme_block[j] = 255;
-
- const int pitch = 64;
- vp9_short_fdct32x32_c(input_block, output_block, pitch);
- vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
-
- // The minimum quant value is 4.
- for (int j = 0; j < 1024; ++j) {
- EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
- << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
- EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
- << "Error: 32x32 FDCT extreme has coefficient larger than "
- "4*DCT_MAX_VALUE";
- }
- }
-}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans32x32Test,
+ ::testing::Values(
+ make_tuple(&vp9_short_fdct32x32_sse2,
+ &vp9_short_idct32x32_add_sse2, 0),
+ make_tuple(&vp9_short_fdct32x32_rd_sse2,
+ &vp9_short_idct32x32_add_sse2, 1)));
+#endif
} // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 81d242b..ee6c9f6 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -13,14 +13,16 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
#include "vpx_ports/mem.h"
extern "C" {
-#include "vp9_rtcd.h"
-void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
+#include "./vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch);
}
-#include "acm_random.h"
+#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -62,6 +64,7 @@
inv_txfm = iht8x8_add;
}
}
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
@@ -92,8 +95,9 @@
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+ REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_output_block,
+ NULL, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -121,8 +125,9 @@
// Initialize a test block with input range [-15, 15].
for (int j = 0; j < 64; ++j)
test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+ REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_output_block,
+ NULL, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
if (test_output_block[j] < 0)
@@ -165,9 +170,11 @@
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
- for (int j = 0; j < 64; ++j){
- if(test_temp_block[j] > 0) {
+ REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_temp_block,
+ dst, pitch, tx_type_));
+ for (int j = 0; j < 64; ++j) {
+ if (test_temp_block[j] > 0) {
test_temp_block[j] += 2;
test_temp_block[j] /= 4;
test_temp_block[j] *= 4;
@@ -177,7 +184,9 @@
test_temp_block[j] *= 4;
}
}
- RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+ REGISTER_STATE_CHECK(
+ RunInvTxfm(test_input_block, test_temp_block,
+ dst, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
@@ -216,8 +225,12 @@
test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
- RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
- RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+ REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_temp_block,
+ dst, pitch, tx_type_));
+ REGISTER_STATE_CHECK(
+ RunInvTxfm(test_input_block, test_temp_block,
+ dst, pitch, tx_type_));
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 579d7e2..370ffc1 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -522,3 +522,5 @@
94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5
495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm
65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5
+b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm
+65f45ec9a55537aac76104818278e0978f94a678 vp90-2-03-deltaq.webm.md5
diff --git a/test/test.mk b/test/test.mk
index 2042c86..4eb599d 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -629,5 +629,7 @@
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 4cd356d..9bd03b9 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -159,7 +159,7 @@
"vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
"vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
"vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
- "vp90-2-03-size-226x226.webm",
+ "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
#if CONFIG_NON420
"vp91-2-04-yv444.webm"
#endif
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 699b44a..f138c09 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -341,7 +341,7 @@
ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
const int16_t **scan,
const uint8_t **band_translate) {
- ENTROPY_CONTEXT above_ec, left_ec;
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
switch (tx_size) {
case TX_4X4:
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index c6eefda..2e973e5 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -149,8 +149,6 @@
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
- if (!incr)
- return;
assert (v != 0); /* should not be zero */
s = v < 0;
comp_counts->sign[s] += incr;
@@ -177,35 +175,24 @@
}
}
-static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
- int v;
- vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
- for (v = 1; v <= MV_MAX; v++) {
- inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
- inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
- }
-}
void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
++counts->joints[j];
- if (mv_joint_vertical(j))
- ++counts->comps[0].mvcount[MV_MAX + mv->row];
+ if (mv_joint_vertical(j)) {
+ inc_mv_component(mv->row, &counts->comps[0], 1, 1);
+ }
- if (mv_joint_horizontal(j))
- ++counts->comps[1].mvcount[MV_MAX + mv->col];
+ if (mv_joint_horizontal(j)) {
+ inc_mv_component(mv->col, &counts->comps[1], 1, 1);
+ }
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
-void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
- counts_to_context(&nmv_count->comps[0], usehp);
- counts_to_context(&nmv_count->comps[1], usehp);
-}
-
static unsigned int adapt_probs(unsigned int i,
vp9_tree tree,
vp9_prob this_probs[],
@@ -235,8 +222,6 @@
nmv_context *pre_ctx = &pre_fc->nmvc;
nmv_context_counts *cts = &cm->counts.mv;
- vp9_counts_process(cts, allow_hp);
-
adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
for (i = 0; i < 2; ++i) {
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index cfa61c2..df806ac 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -777,6 +777,7 @@
}
}
}
+#if CONFIG_NON420
static void filter_block_plane_non420(VP9_COMMON *cm,
struct macroblockd_plane *plane,
const MODE_INFO *mi,
@@ -896,6 +897,7 @@
dst->buf += 8 * dst->stride;
}
}
+#endif
static void filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane,
@@ -981,8 +983,10 @@
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
LOOP_FILTER_MASK lfm;
+#if CONFIG_NON420
int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
xd->plane[1].subsampling_x == 1);
+#endif
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
@@ -993,16 +997,22 @@
setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
+#if CONFIG_NON420
if (use_420)
+#endif
setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mode_info_stride, &lfm);
for (plane = 0; plane < num_planes; ++plane) {
+#if CONFIG_NON420
if (use_420)
+#endif
filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col,
&lfm);
+#if CONFIG_NON420
else
filter_block_plane_non420(cm, &xd->plane[plane], mi + mi_col,
mi_row, mi_col);
+#endif
}
}
}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c2777aa..f5eeb2c 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,7 +701,7 @@
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on thisbefore enabled
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
# Structured Similarity (SSIM)
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8dfb22c..7f23dc1 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -460,6 +460,7 @@
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
+ assert(bsize >= BLOCK_8X8);
} else {
if (bsize >= BLOCK_8X8)
mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index e41ea54..6cb7c09 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -454,8 +454,7 @@
static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
const int old = *delta_q;
- if (vp9_rb_read_bit(rb))
- *delta_q = vp9_rb_read_signed_literal(rb, 4);
+ *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
return old != *delta_q;
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index eb83903..45758e7 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -851,13 +851,75 @@
}
}
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE bsize) {
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not then return the largest allowed partition size
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
+ int rows_left, int cols_left,
+ int *bh, int *bw) {
+ if ((rows_left <= 0) || (cols_left <= 0)) {
+ return MIN(bsize, BLOCK_8X8);
+ } else {
+ for (; bsize > 0; --bsize) {
+ *bh = num_8x8_blocks_high_lookup[bsize];
+ *bw = num_8x8_blocks_wide_lookup[bsize];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return bsize;
+}
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
+ int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
const int mis = cm->mode_info_stride;
+ int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
+ int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
int block_row, block_col;
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- m[block_row * mis + block_col].mbmi.sb_type = bsize;
+
+ assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+ // Apply the requested partition size to the SB64 if it is all "in image"
+ if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+ (row8x8_remaining >= MI_BLOCK_SIZE)) {
+ for (block_row = 0; block_row < MI_BLOCK_SIZE; ++block_row) {
+ for (block_col = 0; block_col < MI_BLOCK_SIZE; ++block_col) {
+ m[block_row * mis + block_col].mbmi.sb_type = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB64.
+ int bh = num_8x8_blocks_high_lookup[bsize];
+ int bw = num_8x8_blocks_wide_lookup[bsize];
+ int sub_block_row;
+ int sub_block_col;
+ int row_index;
+ int col_index;
+
+ for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+ for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+ // Find a partition size that fits
+ bsize = find_partition_size(cpi->sf.always_this_block_size,
+ (row8x8_remaining - block_row),
+ (col8x8_remaining - block_col), &bh, &bw);
+
+ // Set the mi entries for all 8x8 blocks within the selected size
+ for (sub_block_row = 0; sub_block_row < bh; ++sub_block_row) {
+ for (sub_block_col = 0; sub_block_col < bw; ++sub_block_col) {
+ row_index = block_row + sub_block_row;
+ col_index = block_col + sub_block_col;
+ m[row_index * mis + col_index].mbmi.sb_type = bsize;
+ }
+ }
+ }
}
}
}
@@ -1946,7 +2008,7 @@
cpi->mb.source_variance = UINT_MAX;
if (cpi->sf.use_one_partition_size_always) {
set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+ set_partitioning(cpi, m, mi_row, mi_col);
rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else if (cpi->sf.partition_by_variance) {
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 1203c00..9977289 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -155,7 +155,6 @@
unsigned int (*branch_ct_class0_hp)[2],
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
- vp9_counts_process(nmv_count, usehp);
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
prob->joints,
branch_ct_joint,
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fb0e470..96abeff 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -135,6 +135,7 @@
if (x >= zbin) {
x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ x = clamp(x, INT16_MIN, INT16_MAX);
y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
quant_shift_ptr[rc != 0]) >> 15; // quantize (x)
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 7deb981..ae0d6cd 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -70,9 +70,15 @@
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
punpckhqdq m0, m0
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ paddsw m6, m1
+ punpckhqdq m1, m1
+ paddsw m11, m1
+%else
paddw m6, m1 ; m6 += round
punpckhqdq m1, m1
paddw m11, m1 ; m11 += round
+%endif
pmulhw m8, m6, m2 ; m8 = m6*q>>16
punpckhqdq m2, m2
pmulhw m13, m11, m2 ; m13 = m11*q>>16
@@ -126,9 +132,12 @@
pmovmskb r2, m12
or r6, r2
jz .skip_iter
-%endif
+ paddsw m6, m1
+ paddsw m11, m1
+%else
paddw m6, m1 ; m6 += round
paddw m11, m1 ; m11 += round
+%endif
pmulhw m14, m6, m2 ; m14 = m6*q>>16
pmulhw m13, m11, m2 ; m13 = m11*q>>16
paddw m14, m6 ; m14 += m6