Merge "VPX: x86 asm version of vpx_idct32x32_1024_add()"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 37ed86f..98248b0 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1208,14 +1208,20 @@
soft_enable runtime_cpu_detect
# We can't use 'check_cflags' until the compiler is configured and CC is
# populated.
- check_gcc_machine_option mmx
- check_gcc_machine_option sse
- check_gcc_machine_option sse2
- check_gcc_machine_option sse3
- check_gcc_machine_option ssse3
- check_gcc_machine_option sse4 sse4_1
- check_gcc_machine_option avx
- check_gcc_machine_option avx2
+ for ext in ${ARCH_EXT_LIST_X86}; do
+ # disable higher order extensions to simplify asm dependencies
+ if [ "$disable_exts" = "yes" ]; then
+ if ! disabled $ext; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+ disable_feature $ext
+ fi
+ elif disabled $ext; then
+ disable_exts="yes"
+ else
+ # use the shortened version for the flag: sse4_1 -> sse4
+ check_gcc_machine_option ${ext%_*} $ext
+ fi
+ done
if enabled external_build; then
log_echo " skipping assembler detection"
diff --git a/configure b/configure
index 24992c4..f12779c 100755
--- a/configure
+++ b/configure
@@ -234,6 +234,16 @@
x86
x86_64
"
+ARCH_EXT_LIST_X86="
+ mmx
+ sse
+ sse2
+ sse3
+ ssse3
+ sse4_1
+ avx
+ avx2
+"
ARCH_EXT_LIST="
edsp
media
@@ -245,14 +255,7 @@
msa
mips64
- mmx
- sse
- sse2
- sse3
- ssse3
- sse4_1
- avx
- avx2
+ ${ARCH_EXT_LIST_X86}
"
HAVE_LIST="
${ARCH_EXT_LIST}
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index d383131..290bdc7 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -194,6 +194,48 @@
int16_t sum_c_;
};
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+ virtual void SetUp() {
+ satd_size_ = GET_PARAM(0);
+ satd_func_ = GET_PARAM(1);
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<int16_t*>(
+ vpx_memalign(16, sizeof(*src_) * satd_size_));
+ ASSERT_TRUE(src_ != NULL);
+ }
+
+ virtual void TearDown() {
+ libvpx_test::ClearSystemState();
+ vpx_free(src_);
+ }
+
+ void FillConstant(const int16_t val) {
+ for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+ }
+
+ void FillRandom() {
+ for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+ }
+
+ void Check(const int expected) {
+ int total;
+ ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
+ EXPECT_EQ(expected, total);
+ }
+
+ int satd_size_;
+
+ private:
+ int16_t *src_;
+ SatdFunc satd_func_;
+ ACMRandom rnd_;
+};
uint8_t* AverageTestBase::source_data_ = NULL;
@@ -246,6 +288,36 @@
RunComparison();
}
+
+TEST_P(SatdTest, MinValue) {
+ const int kMin = -32640;
+ const int expected = -kMin * satd_size_;
+ FillConstant(kMin);
+ Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+ const int kMax = 32640;
+ const int expected = kMax * satd_size_;
+ FillConstant(kMax);
+ Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+ int expected;
+ switch (satd_size_) {
+ case 16: expected = 205298; break;
+ case 64: expected = 1113950; break;
+ case 256: expected = 4268415; break;
+ case 1024: expected = 16954082; break;
+ default:
+ FAIL() << "Invalid satd size (" << satd_size_
+ << ") valid: 16/64/256/1024";
+ }
+ FillRandom();
+ Check(expected);
+}
+
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
@@ -254,6 +326,14 @@
make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
+INSTANTIATE_TEST_CASE_P(
+ C, SatdTest,
+ ::testing::Values(
+ make_tuple(16, &vp9_satd_c),
+ make_tuple(64, &vp9_satd_c),
+ make_tuple(256, &vp9_satd_c),
+ make_tuple(1024, &vp9_satd_c)));
+
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, AverageTest,
@@ -276,6 +356,14 @@
make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, SatdTest,
+ ::testing::Values(
+ make_tuple(16, &vp9_satd_sse2),
+ make_tuple(64, &vp9_satd_sse2),
+ make_tuple(256, &vp9_satd_sse2),
+ make_tuple(1024, &vp9_satd_sse2)));
#endif
#if HAVE_NEON
@@ -297,6 +385,14 @@
make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, SatdTest,
+ ::testing::Values(
+ make_tuple(16, &vp9_satd_neon),
+ make_tuple(64, &vp9_satd_neon),
+ make_tuple(256, &vp9_satd_neon),
+ make_tuple(1024, &vp9_satd_neon)));
#endif
#if HAVE_MSA
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 890b638..8fe6503 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -209,8 +209,8 @@
add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
specialize qw/vp9_hadamard_16x16 sse2/;
-add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
+add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2 neon/;
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2 neon/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index d569ec9..5996bd4 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -50,6 +50,33 @@
return (horizontal_add_u16x8(v_sum) + 32) >> 6;
}
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vp9_satd_neon(const int16_t *coeff, int length) {
+ const int16x4_t zero = vdup_n_s16(0);
+ int32x4_t accum = vdupq_n_s32(0);
+
+ do {
+ const int16x8_t src0 = vld1q_s16(coeff);
+ const int16x8_t src8 = vld1q_s16(coeff + 8);
+ accum = vabal_s16(accum, vget_low_s16(src0), zero);
+ accum = vabal_s16(accum, vget_high_s16(src0), zero);
+ accum = vabal_s16(accum, vget_low_s16(src8), zero);
+ accum = vabal_s16(accum, vget_high_s16(src8), zero);
+ length -= 16;
+ coeff += 16;
+ } while (length != 0);
+
+ {
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'.
+ const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+ vreinterpret_s32_s64(vget_high_s64(s0)));
+ const int satd = vget_lane_s32(s1, 0);
+ return satd;
+ }
+}
+
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
const int ref_stride, const int height) {
int i;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index a9a4c30..7baa09a 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -117,14 +117,14 @@
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
-int16_t vp9_satd_c(const int16_t *coeff, int length) {
+int vp9_satd_c(const int16_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i)
satd += abs(coeff[i]);
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
- return (int16_t)satd;
+ return satd;
}
// Integer projection onto row vectors.
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index e87a12e..fc76c11 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -316,7 +316,8 @@
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs,
- PICK_MODE_CONTEXT *ctx) {
+ PICK_MODE_CONTEXT *ctx,
+ VP9_DENOISER_DECISION *denoiser_decision) {
int mv_col, mv_row;
int motion_magnitude = 0;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
@@ -380,6 +381,7 @@
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
}
+ *denoiser_decision = decision;
}
static void copy_frame(YV12_BUFFER_CONFIG * const dest,
@@ -458,6 +460,7 @@
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
ctx->zeromv_sse = UINT_MAX;
ctx->newmv_sse = UINT_MAX;
+ ctx->zeromv_lastref_sse = UINT_MAX;
}
void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index bc676e9..c8c9352 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -54,7 +54,8 @@
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs,
- PICK_MODE_CONTEXT *ctx);
+ PICK_MODE_CONTEXT *ctx ,
+ VP9_DENOISER_DECISION *denoiser_decision);
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f9c28f6..7e56989 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1746,16 +1746,6 @@
set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 &&
- output_enabled &&
- cpi->common.frame_type != KEY_FRAME &&
- cpi->resize_pending == 0) {
- vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
- VPXMAX(BLOCK_8X8, bsize), ctx);
- }
-#endif
-
encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
update_stats(&cpi->common, td);
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index b41ffd0..b26f6f2 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -25,7 +25,7 @@
int width,
int height) {
ne->enabled = 0;
- ne->level = kLow;
+ ne->level = kLowLow;
ne->value = 0;
ne->count = 0;
ne->thresh = 90;
@@ -220,22 +220,25 @@
// Reset counter and check noise level condition.
ne->num_frames_estimate = 30;
ne->count = 0;
- if (ne->value > (ne->thresh << 1))
+ if (ne->value > (ne->thresh << 1)) {
ne->level = kHigh;
- else
+ } else {
if (ne->value > ne->thresh)
ne->level = kMedium;
else if (ne->value > (ne->thresh >> 1))
ne->level = kLow;
else
ne->level = kLowLow;
+ }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0)
+ vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
}
}
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
+ if (cpi->oxcf.noise_sensitivity > 0)
copy_frame(&cpi->denoiser.last_source, cpi->Source);
- vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
- }
#endif
}
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8aafae1..095847a 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -673,7 +673,7 @@
if (*eob == 1)
*rate += (int)abs(qcoeff[0]);
else if (*eob > 1)
- *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+ *rate += vp9_satd((const int16_t *)qcoeff, step << 4);
*dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
}
@@ -1143,6 +1143,9 @@
int best_pred_sad = INT_MAX;
int best_early_term = 0;
int ref_frame_cost[MAX_REF_FRAMES];
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ int64_t zero_last_cost_orig = INT64_MAX;
+#endif
init_ref_frame_cost(cm, xd, ref_frame_cost);
@@ -1524,8 +1527,12 @@
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0)
+ if (cpi->oxcf.noise_sensitivity > 0) {
vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+ // Keep track of zero_last cost.
+ if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+ zero_last_cost_orig = this_rdc.rdcost;
+ }
#else
(void)ctx;
#endif
@@ -1683,6 +1690,54 @@
}
}
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->resize_pending == 0) {
+ VP9_DENOISER_DECISION decision = COPY_BLOCK;
+ vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
+ VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
+ // If INTRA mode was selected, re-evaluate ZEROMV on denoised result.
+ // Only do this under noise conditions, and if rdcost of ZEROMV on
+ // original source is not significantly higher than rdcost of INTRA MODE.
+ if (best_ref_frame == INTRA_FRAME &&
+ decision == FILTER_BLOCK &&
+ cpi->noise_estimate.enabled &&
+ cpi->noise_estimate.level > kLow &&
+ zero_last_cost_orig < (best_rdc.rdcost << 2)) {
+ // Check if we should pick ZEROMV on denoised signal.
+ int rate = 0;
+ int64_t dist = 0;
+ mbmi->mode = ZEROMV;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->mv[0].as_int = 0;
+ mbmi->interp_filter = EIGHTTAP;
+ xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+ this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] +
+ cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+ [INTER_OFFSET(ZEROMV)];
+ this_rdc.dist = dist;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is lower than INTRA (on original source).
+ if (this_rdc.rdcost > best_rdc.rdcost) {
+ this_rdc = best_rdc;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ mbmi->mv[0].as_int = INVALID_MV;
+ mbmi->interp_filter = best_pred_filter;
+ mbmi->tx_size = best_tx_size;
+ x->skip_txfm[0] = best_mode_skip_txfm;
+ } else {
+ best_ref_frame = LAST_FRAME;
+ best_rdc = this_rdc;
+ }
+ }
+ }
+#endif
+
if (cpi->sf.adaptive_rd_thresh) {
THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4531d79..4414871 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -283,31 +283,30 @@
}
}
-int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+int vp9_satd_sse2(const int16_t *coeff, int length) {
int i;
- __m128i sum = _mm_load_si128((const __m128i *)coeff);
- __m128i sign = _mm_srai_epi16(sum, 15);
- __m128i val = _mm_xor_si128(sum, sign);
- sum = _mm_sub_epi16(val, sign);
- coeff += 8;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
- for (i = 8; i < length; i += 8) {
- __m128i src_line = _mm_load_si128((const __m128i *)coeff);
- sign = _mm_srai_epi16(src_line, 15);
- val = _mm_xor_si128(src_line, sign);
- val = _mm_sub_epi16(val, sign);
- sum = _mm_add_epi16(sum, val);
+ for (i = 0; i < length; i += 8) {
+ const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i inv = _mm_sub_epi16(zero, src_line);
+ const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
+ const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+ const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+ const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+ accum = _mm_add_epi32(accum, sum);
coeff += 8;
}
- val = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi16(sum, val);
- val = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi16(sum, val);
- val = _mm_srli_epi32(sum, 16);
- sum = _mm_add_epi16(sum, val);
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
- return _mm_extract_epi16(sum, 0);
+ return _mm_cvtsi128_si32(accum);
}
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 04b39a5..62c2d29 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -545,33 +545,31 @@
RET
INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
movq m0, [aboveq]
punpcklbw m2, m1
- punpcklbw m0, m1
- pshuflw m2, m2, 0x0
+ punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+ pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
DEFINE_ARGS dst, stride, line, left
mov lineq, -4
- punpcklqdq m2, m2
- add leftq, 8
- psubw m0, m2
-.loop:
- movd m2, [leftq+lineq*2]
- movd m3, [leftq+lineq*2+1]
- punpcklbw m2, m1
- punpcklbw m3, m1
- pshuflw m2, m2, 0x0
- pshuflw m3, m3, 0x0
- punpcklqdq m2, m2
- punpcklqdq m3, m3
- paddw m2, m0
+ punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
+ psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+ movq m2, [leftq]
+ punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+ punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+ paddw m4, m0
paddw m3, m0
- packuswb m2, m3
- movq [dstq ], m2
- movhps [dstq+strideq], m2
+ packuswb m4, m3
+ movq [dstq ], m4
+ movhps [dstq+strideq], m4
lea dstq, [dstq+strideq*2]
+ psrldq m2, 4
inc lineq
jnz .loop
REP_RET