Merge "Merge branch 'masterbase' into nextgenv2" into nextgenv2
diff --git a/CHANGELOG b/CHANGELOG
index 7746cc6..7db420e 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+Next Release
+ - Incompatible changes:
+ The VP9 encoder's default keyframe interval changed to 128 from 9999.
+
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index 0248ede..fc775ef 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -347,8 +347,7 @@
double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
int psnr_count[NUM_ENCODERS] = {0};
- double cx_time = 0;
- struct timeval tv1, tv2, difftv;
+ int64_t cx_time = 0;
/* Set the required target bitrates for each resolution level.
* If target bitrate for highest-resolution level is set to 0,
@@ -582,6 +581,7 @@
while(frame_avail || got_data)
{
+ struct vpx_usec_timer timer;
vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
@@ -636,18 +636,18 @@
vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
}
- gettimeofday(&tv1, NULL);
/* Encode each frame at multi-levels */
/* Note the flags must be set to 0 in the encode call if they are set
for each frame with the vpx_codec_control(), as done above. */
+ vpx_usec_timer_start(&timer);
if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
frame_cnt, 1, 0, arg_deadline))
{
die_codec(&codec[0], "Failed to encode frame");
}
- gettimeofday(&tv2, NULL);
- timersub(&tv2, &tv1, &difftv);
- cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+ vpx_usec_timer_mark(&timer);
+ cx_time += vpx_usec_timer_elapsed(&timer);
+
for (i=NUM_ENCODERS-1; i>=0 ; i--)
{
got_data = 0;
@@ -686,8 +686,10 @@
frame_cnt++;
}
printf("\n");
- printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
- 1000000 * (double)frame_cnt / (double)cx_time);
+ printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n",
+ frame_cnt,
+ 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+ 1000000 * (double)frame_cnt / (double)cx_time);
fclose(infile);
diff --git a/test/altref_test.cc b/test/altref_test.cc
index af25b72..0799f42 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -14,6 +14,8 @@
#include "test/util.h"
namespace {
+#if CONFIG_VP8_ENCODER
+
// lookahead range: [kLookAheadMin, kLookAheadMax).
const int kLookAheadMin = 5;
const int kLookAheadMax = 26;
@@ -63,7 +65,106 @@
EXPECT_GE(altref_count(), 1);
}
-
VP8_INSTANTIATE_TEST_CASE(AltRefTest,
::testing::Range(kLookAheadMin, kLookAheadMax));
+
+#endif // CONFIG_VP8_ENCODER
+
+class AltRefForcedKeyTestLarge
+ : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+ AltRefForcedKeyTestLarge()
+ : EncoderTest(GET_PARAM(0)),
+ encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)),
+ forced_kf_frame_num_(1),
+ frame_num_(0) {}
+ virtual ~AltRefForcedKeyTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ cfg_.rc_end_usage = VPX_VBR;
+ cfg_.g_threads = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+ encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+ // override test default for tile columns if necessary.
+#if CONFIG_VP9_ENCODER
+ if (GET_PARAM(0) == &libvpx_test::kVP9) {
+ encoder->Control(VP9E_SET_TILE_COLUMNS, 6);
+ }
+#endif
+#if CONFIG_VP10_ENCODER
+ if (GET_PARAM(0) == &libvpx_test::kVP10) {
+ encoder->Control(VP9E_SET_TILE_COLUMNS, 6);
+ }
+#endif
+ }
+ frame_flags_ =
+ (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
+ }
+
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+ if (frame_num_ == forced_kf_frame_num_) {
+ ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
+ << "Frame #" << frame_num_ << " isn't a keyframe!";
+ }
+ ++frame_num_;
+ }
+
+ ::libvpx_test::TestMode encoding_mode_;
+ int cpu_used_;
+ unsigned int forced_kf_frame_num_;
+ unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+ const vpx_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ forced_kf_frame_num_ = 1;
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+ const vpx_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ forced_kf_frame_num_ = lag_values[i] - 1;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+VP8_INSTANTIATE_TEST_CASE(
+ AltRefForcedKeyTestLarge,
+ ::testing::Values(::libvpx_test::kOnePassGood),
+ ::testing::Range(0, 9));
+
+VP9_INSTANTIATE_TEST_CASE(
+ AltRefForcedKeyTestLarge,
+ ::testing::Values(::libvpx_test::kOnePassGood),
+ ::testing::Range(0, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+ AltRefForcedKeyTestLarge,
+ ::testing::Values(::libvpx_test::kOnePassGood),
+ ::testing::Range(0, 9));
+
} // namespace
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 6a938a0..572834c 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,7 +26,8 @@
: EncoderTest(GET_PARAM(0)),
encoding_mode_(GET_PARAM(1)),
set_cpu_used_(GET_PARAM(2)),
- min_psnr_(kMaxPSNR) {}
+ min_psnr_(kMaxPSNR),
+ tune_content_(VP9E_CONTENT_DEFAULT) {}
virtual ~CpuSpeedTest() {}
virtual void SetUp() {
@@ -49,6 +50,7 @@
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
if (encoding_mode_ != ::libvpx_test::kRealTime) {
encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -66,6 +68,7 @@
::libvpx_test::TestMode encoding_mode_;
int set_cpu_used_;
double min_psnr_;
+ int tune_content_;
};
TEST_P(CpuSpeedTest, TestQ0) {
@@ -103,6 +106,21 @@
EXPECT_GE(min_psnr_, kMaxPSNR);
}
+TEST_P(CpuSpeedTest, TestTuneScreen) {
+ ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ tune_content_ = VP9E_CONTENT_SCREEN;
+
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
// Validate that this non multiple of 64 wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 9d5074e..5467c46 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -519,6 +519,9 @@
cfg_.rc_end_usage = VPX_CBR;
cfg_.rc_target_bitrate = 200;
cfg_.g_lag_in_frames = 0;
+ // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+ // interval (128).
+ cfg_.kf_max_dist = 9999;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
@@ -774,10 +777,6 @@
svc_params_.max_quantizers[i] = 63;
svc_params_.min_quantizers[i] = 0;
}
- svc_params_.scaling_factor_num[0] = 144;
- svc_params_.scaling_factor_den[0] = 288;
- svc_params_.scaling_factor_num[1] = 288;
- svc_params_.scaling_factor_den[1] = 288;
encoder->Control(VP9E_SET_SVC, 1);
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
encoder->Control(VP8E_SET_CPUUSED, speed_setting_);
@@ -814,8 +813,6 @@
if (bits_total_) {
const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
duration_ = (last_pts_ + 1) * timebase_;
- effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0
- / (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
file_datarate_ = file_size_in_kb / duration_;
}
}
@@ -839,7 +836,6 @@
int64_t bits_total_;
double duration_;
double file_datarate_;
- double effective_datarate_;
size_t bits_in_last_frame_;
vpx_svc_extra_cfg_t svc_params_;
int speed_setting_;
@@ -884,7 +880,49 @@
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
// 3 temporal layers. Run CIF clip with 1 thread.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.ss_number_layers = 2;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = 1;
+ cfg_.temporal_layering_mode = 3;
+ svc_params_.scaling_factor_num[0] = 144;
+ svc_params_.scaling_factor_den[0] = 288;
+ svc_params_.scaling_factor_num[1] = 288;
+ svc_params_.scaling_factor_den[1] = 288;
+ cfg_.rc_dropframe_thresh = 10;
+ cfg_.kf_max_dist = 9999;
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 200);
+ // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
+ // layer target_bitrate.
+ for (int i = 200; i <= 800; i += 200) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
+ << " The datarate for the file is lower than the target by too much!";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+ }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayersSmallKf) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -907,25 +945,26 @@
cfg_.rc_dropframe_thresh = 10;
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 200);
- // TODO(wonkap/marpan): Check that effective_datarate for each layer hits the
- // layer target_bitrate. Also check if test can pass at lower bitrate (~200k).
- for (int i = 400; i <= 800; i += 200) {
- cfg_.rc_target_bitrate = i;
+ cfg_.rc_target_bitrate = 400;
+ // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+ // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+ for (int j = 64; j <= 67; j++) {
+ cfg_.kf_max_dist = j;
ResetModel();
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
- EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
}
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and
// 3 temporal layers. Run HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SpatialLayers4threads) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -946,6 +985,7 @@
svc_params_.scaling_factor_num[1] = 288;
svc_params_.scaling_factor_den[1] = 288;
cfg_.rc_dropframe_thresh = 10;
+ cfg_.kf_max_dist = 9999;
::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
30, 1, 0, 300);
cfg_.rc_target_bitrate = 800;
@@ -953,19 +993,143 @@
assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
cfg_.ts_number_layers, cfg_.temporal_layering_mode);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
<< " The datarate for the file exceeds the target by too much!";
ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15)
<< " The datarate for the file is lower than the target by too much!";
- EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run CIF clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.ss_number_layers = 3;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = 1;
+ cfg_.temporal_layering_mode = 3;
+ svc_params_.scaling_factor_num[0] = 72;
+ svc_params_.scaling_factor_den[0] = 288;
+ svc_params_.scaling_factor_num[1] = 144;
+ svc_params_.scaling_factor_den[1] = 288;
+ svc_params_.scaling_factor_num[2] = 288;
+ svc_params_.scaling_factor_den[2] = 288;
+ cfg_.rc_dropframe_thresh = 10;
+ cfg_.kf_max_dist = 9999;
+ ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+ 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 800;
+ ResetModel();
+ assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+ << " The datarate for the file is lower than the target by too much!";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers. Run CIF clip with 1 thread, and few short key frame periods.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayersSmallKf) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.ss_number_layers = 3;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = 1;
+ cfg_.temporal_layering_mode = 3;
+ svc_params_.scaling_factor_num[0] = 72;
+ svc_params_.scaling_factor_den[0] = 288;
+ svc_params_.scaling_factor_num[1] = 144;
+ svc_params_.scaling_factor_den[1] = 288;
+ svc_params_.scaling_factor_num[2] = 288;
+ svc_params_.scaling_factor_den[2] = 288;
+ cfg_.rc_dropframe_thresh = 10;
+ ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+ 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 800;
+ // For this 3 temporal layer case, pattern repeats every 4 frames, so choose
+ // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2).
+ for (int j = 32; j <= 35; j++) {
+ cfg_.kf_max_dist = j;
+ ResetModel();
+ assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30)
+ << " The datarate for the file is lower than the target by too much!";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+ }
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and
+// 3 temporal layers. Run HD clip with 4 threads.
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SpatialLayers4threads) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.ss_number_layers = 3;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = 4;
+ cfg_.temporal_layering_mode = 3;
+ svc_params_.scaling_factor_num[0] = 72;
+ svc_params_.scaling_factor_den[0] = 288;
+ svc_params_.scaling_factor_num[1] = 144;
+ svc_params_.scaling_factor_den[1] = 288;
+ svc_params_.scaling_factor_num[2] = 288;
+ svc_params_.scaling_factor_den[2] = 288;
+ cfg_.rc_dropframe_thresh = 10;
+ cfg_.kf_max_dist = 9999;
+ ::libvpx_test::I420VideoSource video("niklas_1280_720_30.y4m", 1280, 720,
+ 30, 1, 0, 300);
+ cfg_.rc_target_bitrate = 800;
+ ResetModel();
+ assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
+ cfg_.ts_number_layers, cfg_.temporal_layering_mode);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22)
+ << " The datarate for the file is lower than the target by too much!";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large,
::testing::Values(::libvpx_test::kOnePassGood,
::libvpx_test::kRealTime),
- ::testing::Range(2, 7));
+ ::testing::Range(2, 9));
VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc,
::testing::Values(::libvpx_test::kRealTime),
- ::testing::Range(5, 8));
+ ::testing::Range(5, 9));
} // namespace
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index b16f14c..778a36c 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -430,7 +430,7 @@
using std::tr1::make_tuple;
-#if HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
MMX, Loop8Test6Param,
::testing::Values(
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 0177308..eaebd75 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -463,6 +463,17 @@
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
+ virtual void MismatchHook(const vpx_image_t *img1,
+ const vpx_image_t *img2) {
+ double mismatch_psnr = compute_psnr(img1, img2);
+ mismatch_psnr_ += mismatch_psnr;
+ ++mismatch_nframes_;
+ }
+
+ unsigned int GetMismatchFrames() {
+ return mismatch_nframes_;
+ }
+
void DefaultConfig() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
@@ -488,6 +499,8 @@
std::vector< FrameInfo > frame_info_list_;
int set_cpu_used_;
bool change_bitrate_;
+ double mismatch_psnr_;
+ int mismatch_nframes_;
};
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
@@ -497,6 +510,8 @@
// Disable internal resize for this test.
cfg_.rc_resize_allowed = 0;
change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
@@ -510,6 +525,7 @@
<< "Frame " << frame << " had unexpected width";
EXPECT_EQ(expected_h, info->h)
<< "Frame " << frame << " had unexpected height";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
}
@@ -523,6 +539,8 @@
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
@@ -542,6 +560,7 @@
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
@@ -554,6 +573,8 @@
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = true;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
// Disable dropped frames.
cfg_.rc_dropframe_thresh = 0;
// Starting bitrate low.
@@ -583,6 +604,7 @@
// Verify that we get 2 resize events in this test.
ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
vpx_img_fmt_t CspForFrameNumber(int frame) {
diff --git a/test/test.mk b/test/test.mk
index 1f120ce..db2e361 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -19,6 +19,7 @@
LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += datarate_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc
@@ -28,7 +29,6 @@
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
diff --git a/third_party/x86inc/README.libvpx b/third_party/x86inc/README.libvpx
index e91e305..8d3cd96 100644
--- a/third_party/x86inc/README.libvpx
+++ b/third_party/x86inc/README.libvpx
@@ -1,5 +1,5 @@
-URL: http://git.videolan.org/?p=x264.git
-Version: a95584945dd9ce3acc66c6cd8f6796bc4404d40d
+URL: https://git.videolan.org/git/x264.git
+Version: d23d18655249944c1ca894b451e2c82c7a584c62
License: ISC
License File: LICENSE
@@ -13,12 +13,8 @@
Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
exist in libvpx.
Expand PIC default to macho64 and respect CONFIG_PIC from libvpx
-Catch all elf formats for 'hidden' status and SECTION notes.
-Avoid 'amdnop' when building with nasm.
Set 'private_extern' visibility for macho targets.
Copy PIC 'GLOBAL' macros from x86_abi_support.asm
Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
Use .text with no alignment for aout
Only use 'hidden' visibility with Chromium
-Move '%use smartalign' for nasm out of 'INIT_CPUFLAGS' and before
- 'ALIGNMODE'.
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index e7d3fa5..b647dff 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -1,7 +1,7 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2015 x264 project
+;* Copyright (C) 2005-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
@@ -66,16 +66,35 @@
%endif
%endif
-%ifidn __OUTPUT_FORMAT__,elf32
- %define mangle(x) x
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
- %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,x64
- %define mangle(x) x
-%elifidn __OUTPUT_FORMAT__,win64
- %define mangle(x) x
+ %define FORMAT_ELF 1
+%endif
+
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,macho32
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libvpx builds.
+%if FORMAT_ELF
+ %undef PREFIX
+%elif WIN64
+ %undef PREFIX
%else
+ %define PREFIX
+%endif
+
+%ifdef PREFIX
%define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
%endif
; In some instances macho32 tables get misaligned when using .rodata.
@@ -94,14 +113,6 @@
%endif
%endmacro
-%macro SECTION_TEXT 0-1 16
- %ifidn __OUTPUT_FORMAT__,aout
- SECTION .text
- %else
- SECTION .text align=%1
- %endif
-%endmacro
-
; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC"
; from original code is added in for 64bit.
%ifidn __OUTPUT_FORMAT__,elf32
@@ -188,8 +199,16 @@
%ifdef PIC
default rel
%endif
+
+%ifndef GET_GOT_DEFINED
+ %define GET_GOT_DEFINED 0
+%endif
; Done with PIC macros
+%ifdef __NASM_VER__
+ %use smartalign
+%endif
+
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
@@ -237,6 +256,7 @@
%define r%1w %2w
%define r%1b %2b
%define r%1h %2h
+ %define %2q %2
%if %0 == 2
%define r%1m %2d
%define r%1mp %2
@@ -261,9 +281,9 @@
%define e%1h %3
%define r%1b %2
%define e%1b %2
-%if ARCH_X86_64 == 0
- %define r%1 e%1
-%endif
+ %if ARCH_X86_64 == 0
+ %define r%1 e%1
+ %endif
%endmacro
DECLARE_REG_SIZE ax, al, ah
@@ -373,7 +393,7 @@
%macro ASSERT 1
%if (%1) == 0
- %error assert failed
+ %error assertion ``%1'' failed
%endif
%endmacro
@@ -464,8 +484,10 @@
%if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
%if %1 > 0
%assign regs_used (regs_used + 1)
- %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
- %warning "Stack pointer will overwrite register argument"
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments
+ %assign regs_used 5 + UNIX64 * 3
%endif
%endif
%endif
@@ -579,9 +601,9 @@
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
@@ -618,17 +640,17 @@
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
- mov rsp, rstkm
-%else
- add rsp, stack_size_padded
-%endif
-%endif
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
POP_IF_USED 14, 13, 12, 11, 10, 9
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
@@ -674,29 +696,29 @@
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%macro RET 0
-%if stack_size_padded > 0
-%if required_stack_alignment > STACK_ALIGNMENT
- mov rsp, rstkm
-%else
- add rsp, stack_size_padded
-%endif
-%endif
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
POP_IF_USED 6, 5, 4, 3
-%if mmsize == 32
- vzeroupper
-%endif
+ %if mmsize == 32
+ vzeroupper
+ %endif
AUTO_REP_RET
%endmacro
%endif ;======================================================================
%if WIN64 == 0
-%macro WIN64_SPILL_XMM 1
-%endmacro
-%macro WIN64_RESTORE_XMM 1
-%endmacro
-%macro WIN64_PUSH_XMM 0
-%endmacro
+ %macro WIN64_SPILL_XMM 1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 1
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
%endif
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
@@ -709,24 +731,26 @@
%else
rep ret
%endif
+ annotate_function_size
%endmacro
%define last_branch_adr $$
%macro AUTO_REP_RET 0
- %ifndef cpuflags
- times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
- %elif notcpuflag(ssse3)
- times ((last_branch_adr-$)>>31)+1 rep
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
%endif
ret
+ annotate_function_size
%endmacro
%macro BRANCH_INSTR 0-*
%rep %0
%macro %1 1-2 %1
%2 %1
- %%branch_instr:
- %xdefine last_branch_adr %%branch_instr
+ %if notcpuflag(ssse3)
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
%endmacro
%rotate 1
%endrep
@@ -741,6 +765,7 @@
%elif %2
jmp %1
%endif
+ annotate_function_size
%endmacro
;=============================================================================
@@ -762,6 +787,7 @@
cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
+ annotate_function_size
%if %1
%xdefine %%FUNCTION_PREFIX private_prefix
; libvpx explicitly sets visibility in shared object builds. Avoid
@@ -782,17 +808,10 @@
CAT_XDEFINE cglobaled_, %2, 1
%endif
%xdefine current_function %2
- %ifidn __OUTPUT_FORMAT__,elf32
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
global %2:function %%VISIBILITY
- %elifidn __OUTPUT_FORMAT__,elf64
- global %2:function %%VISIBILITY
- %elifidn __OUTPUT_FORMAT__,macho32
- %ifdef __NASM_VER__
- global %2
- %else
- global %2:private_extern
- %endif
- %elifidn __OUTPUT_FORMAT__,macho64
+ %elif FORMAT_MACHO
%ifdef __NASM_VER__
global %2
%else
@@ -822,16 +841,16 @@
; like cextern, but without the prefix
%macro cextern_naked 1
- %xdefine %1 mangle(%1)
+ %ifdef PREFIX
+ %xdefine %1 mangle(%1)
+ %endif
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
%macro const 1-2+
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
- %ifidn __OUTPUT_FORMAT__,elf32
- global %1:data hidden
- %elifidn __OUTPUT_FORMAT__,elf64
+ %if FORMAT_ELF
global %1:data hidden
%else
global %1
@@ -839,14 +858,29 @@
%1: %2
%endmacro
-; This is needed for ELF, otherwise the GNU linker assumes the stack is
-; executable by default.
-%ifidn __OUTPUT_FORMAT__,elf32
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
-%elifidn __OUTPUT_FORMAT__,elf64
-SECTION .note.GNU-stack noalloc noexec nowrite progbits
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
; cpuflags
%assign cpuflags_mmx (1<<0)
@@ -875,12 +909,9 @@
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
-%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
-%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
-
-%ifdef __NASM_VER__
- %use smartalign
-%endif
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
@@ -917,12 +948,18 @@
%endif
%endif
- %ifdef __NASM_VER__
- ALIGNMODE k7
- %elif ARCH_X86_64 || cpuflag(sse2)
- CPU amdnop
+ %if ARCH_X86_64 || cpuflag(sse2)
+ %ifdef __NASM_VER__
+ ALIGNMODE k8
+ %else
+ CPU amdnop
+ %endif
%else
- CPU basicnop
+ %ifdef __NASM_VER__
+ ALIGNMODE nop
+ %else
+ CPU basicnop
+ %endif
%endif
%endmacro
@@ -951,14 +988,14 @@
%define movnta movntq
%assign %%i 0
%rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nnmm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nnmm, %%i, %%i
+ %assign %%i %%i+1
%endrep
%rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nnmm, %%i
- %assign %%i %%i+1
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nnmm, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -969,7 +1006,7 @@
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 16
%endif
%define mova movdqa
%define movu movdqu
@@ -977,9 +1014,9 @@
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nnxmm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nnxmm, %%i, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -990,7 +1027,7 @@
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
- %define num_mmregs 16
+ %define num_mmregs 16
%endif
%define mova movdqa
%define movu movdqu
@@ -998,9 +1035,9 @@
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
- CAT_XDEFINE m, %%i, ymm %+ %%i
- CAT_XDEFINE nnymm, %%i, %%i
- %assign %%i %%i+1
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nnymm, %%i, %%i
+ %assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%endmacro
@@ -1024,7 +1061,7 @@
%assign i 0
%rep 16
DECLARE_MMCAST i
-%assign i i+1
+ %assign i i+1
%endrep
; I often want to use macros that permute their arguments. e.g. there's no
@@ -1042,23 +1079,23 @@
; doesn't cost any cycles.
%macro PERMUTE 2-* ; takes a list of pairs to swap
-%rep %0/2
- %xdefine %%tmp%2 m%2
- %rotate 2
-%endrep
-%rep %0/2
- %xdefine m%1 %%tmp%2
- CAT_XDEFINE nn, m%1, %1
- %rotate 2
-%endrep
+ %rep %0/2
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
%endmacro
%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
-%ifnum %1 ; SWAP 0, 1, ...
- SWAP_INTERNAL_NUM %1, %2
-%else ; SWAP m0, m1, ...
- SWAP_INTERNAL_NAME %1, %2
-%endif
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
%endmacro
%macro SWAP_INTERNAL_NUM 2-*
@@ -1068,7 +1105,7 @@
%xdefine m%2 %%tmp
CAT_XDEFINE nn, m%1, %1
CAT_XDEFINE nn, m%2, %2
- %rotate 1
+ %rotate 1
%endrep
%endmacro
@@ -1076,7 +1113,7 @@
%xdefine %%args nn %+ %1
%rep %0-1
%xdefine %%args %%args, nn %+ %2
- %rotate 1
+ %rotate 1
%endrep
SWAP_INTERNAL_NUM %%args
%endmacro
@@ -1093,7 +1130,7 @@
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE %%f, %%i, m %+ %%i
- %assign %%i %%i+1
+ %assign %%i %%i+1
%endrep
%endmacro
@@ -1103,20 +1140,20 @@
%rep num_mmregs
CAT_XDEFINE m, %%i, %1_m %+ %%i
CAT_XDEFINE nn, m %+ %%i, %%i
- %assign %%i %%i+1
+ %assign %%i %%i+1
%endrep
%endif
%endmacro
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
- call_internal %1, %1 %+ SUFFIX
+ call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
- %xdefine %%i %1
- %ifndef cglobaled_%1
- %ifdef cglobaled_%2
- %xdefine %%i %2
+ %xdefine %%i %2
+ %ifndef cglobaled_%2
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
%endif
%endif
call %%i
@@ -1159,7 +1196,7 @@
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
-%assign i i+1
+ %assign i i+1
%endrep
%undef i
@@ -1536,7 +1573,7 @@
%else
CAT_XDEFINE q, j, i
%endif
-%assign i i+1
+ %assign i i+1
%endrep
%undef i
%undef j
@@ -1559,55 +1596,54 @@
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd
-; convert FMA4 to FMA3 if possible
-%macro FMA4_INSTR 4
- %macro %1 4-8 %1, %2, %3, %4
- %if cpuflag(fma4)
- v%5 %1, %2, %3, %4
- %elifidn %1, %2
- v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
- %elifidn %1, %3
- v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
- %elifidn %1, %4
- v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
- %else
- %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
- %endif
- %endmacro
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifid %3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
%endmacro
-FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
-FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
-FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
-FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
-FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
-FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
-FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
-FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
-
-FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
-FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
-FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
-FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
-
-FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
-FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
-FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
-FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
-
-FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
-FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
-FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
-FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
-
-; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
-%if ARCH_X86_64 == 0
-%macro vpbroadcastq 2
-%if sizeof%1 == 16
- movddup %1, %2
-%else
- vbroadcastsd %1, %2
-%endif
-%endmacro
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__
+ %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+ %macro vpbroadcastq 2
+ %if sizeof%1 == 16
+ movddup %1, %2
+ %else
+ vbroadcastsd %1, %2
+ %endif
+ %endmacro
+ %endif
%endif
diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index 3ac93b5..104a91a 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h
@@ -296,15 +296,15 @@
static INLINE uint8_t vp10_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
int ref_idx) {
- if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
- ref_mv_stack[ref_idx + 1].weight > REF_CAT_LEVEL) {
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL) {
if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
return 0;
else
return 1;
}
- if (ref_mv_stack[ref_idx].weight > REF_CAT_LEVEL &&
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
return 2;
@@ -316,7 +316,6 @@
return 4;
}
- assert(0);
return 0;
}
#endif
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index abb9e0b..97d091a 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -4828,6 +4828,20 @@
arf_src_index = get_arf_src_index(cpi);
if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = vp10_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
assert(arf_src_index <= rc->frames_to_key);
if ((source = vp10_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 113865f..5ae44e8 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,7 +23,7 @@
*/
static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
-static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 80;
/*
* The filter function was modified to reduce the computational complexity.
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 9a379a6..148ccda 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,8 +18,8 @@
extern "C" {
#endif
-#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600) // ~(16 * 16 * 1.5)
+#define SUM_DIFF_THRESHOLD 448
+#define SUM_DIFF_THRESHOLD_HIGH 512
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 4f689c4..2a0c298 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -518,7 +518,6 @@
cpi->b_multi_threaded = 0;
cpi->encoding_thread_count = 0;
- cpi->b_lpf_running = 0;
pthread_mutex_init(&cpi->mt_mutex, NULL);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 354bdfe..0efdac4 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1531,15 +1531,6 @@
if (!oxcf)
return;
-#if CONFIG_MULTITHREAD
- /* wait for the last picture loopfilter thread done */
- if (cpi->b_lpf_running)
- {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
if (cm->version != oxcf->Version)
{
cm->version = oxcf->Version;
@@ -3589,15 +3580,6 @@
/* Clear down mmx registers to allow floating point in what follows */
vp8_clear_system_state();
-#if CONFIG_MULTITHREAD
- /* wait for the last picture loopfilter thread done */
- if (cpi->b_lpf_running)
- {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
if(cpi->force_next_frame_intra)
{
cm->frame_type = KEY_FRAME; /* delayed intra frame */
@@ -4326,8 +4308,6 @@
vp8_setup_key_frame(cpi);
}
-
-
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
{
if(cpi->oxcf.error_resilient_mode)
@@ -4793,7 +4773,6 @@
{
/* start loopfilter in separate thread */
sem_post(&cpi->h_event_start_lpf);
- cpi->b_lpf_running = 1;
}
else
#endif
@@ -4825,11 +4804,10 @@
vp8_pack_bitstream(cpi, dest, dest_end, size);
#if CONFIG_MULTITHREAD
- /* if PSNR packets are generated we have to wait for the lpf */
- if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+ /* wait for the lpf thread done */
+ if (cpi->b_multi_threaded)
{
sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
}
#endif
@@ -5757,14 +5735,6 @@
{
int ret;
-#if CONFIG_MULTITHREAD
- if(cpi->b_lpf_running)
- {
- sem_wait(&cpi->h_event_end_lpf);
- cpi->b_lpf_running = 0;
- }
-#endif
-
#if CONFIG_POSTPROC
cpi->common.show_frame_mi = cpi->common.mi;
ret = vp8_post_proc_frame(&cpi->common, dest, flags);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b436548..86f401c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -536,7 +536,6 @@
int mt_sync_range;
int b_multi_threaded;
int encoding_thread_count;
- int b_lpf_running;
pthread_t *h_encoding_thread;
pthread_t h_filter_thread;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0ea0632..51fbe54 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,7 +50,8 @@
static const int skin_mean[5][2] =
{{7463, 9614}, {6400, 10240}, {7040, 10240}, {8320, 9280}, {6800, 9614}};
static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157}; // q16
-static const int skin_threshold[2] = {1570636, 800000}; // q18
+static const int skin_threshold[6] = {1570636, 1400000, 800000, 800000, 800000,
+ 800000}; // q18
// Evaluates the Mahalanobis distance measure for the input CbCr values.
static int evaluate_skin_color_difference(int cb, int cr, int idx) {
@@ -73,7 +74,7 @@
}
// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr)
+static int is_skin_color(int y, int cb, int cr, int consec_zeromv)
{
if (y < 40 || y > 220)
{
@@ -88,13 +89,31 @@
else
{
int i = 0;
- for (; i < 5; i++)
- {
- if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[1])
- {
- return 1;
- }
- }
+ // No skin if block has been zero motion for long consecutive time.
+ if (consec_zeromv > 80)
+ return 0;
+ // Exit on grey.
+ if (cb == 128 && cr == 128)
+ return 0;
+ // Exit on very strong cb.
+ if (cb > 150 && cr < 110)
+ return 0;
+ for (; i < 5; i++) {
+ int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+ if (skin_color_diff < skin_threshold[i + 1]) {
+ if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+ return 0;
+ else if (consec_zeromv > 30 &&
+ skin_color_diff > (skin_threshold[i + 1] >> 1))
+ return 0;
+ else
+ return 1;
+ }
+ // Exit if difference is much large than the threshold.
+ if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+ return 0;
+ }
+ }
return 0;
}
}
@@ -851,8 +870,10 @@
x->src.v_buffer[4 * x->src.uv_stride + 3] +
x->src.v_buffer[4 * x->src.uv_stride + 4]) >> 2;
x->is_skin = 0;
- if (!cpi->oxcf.screen_content_mode)
- x->is_skin = is_skin_color(y, cb, cr);
+ if (!cpi->oxcf.screen_content_mode) {
+ int block_index = mb_row * cpi->common.mb_cols + mb_col;
+ x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
+ }
}
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9ce137d..32c7219 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,54 +189,31 @@
uint8_t *dst, int stride,
int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- if (eob > 0) {
- tran_low_t *const dqcoeff = pd->dqcoeff;
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
- } else {
- switch (tx_size) {
- case TX_4X4:
- vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
- break;
- default:
- assert(0 && "Invalid transform size");
- }
- }
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->lossless) {
+ vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
} else {
- if (xd->lossless) {
- vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
- } else {
- switch (tx_size) {
- case TX_4X4:
- vp9_idct4x4_add(dqcoeff, dst, stride, eob);
- break;
- case TX_8X8:
- vp9_idct8x8_add(dqcoeff, dst, stride, eob);
- break;
- case TX_16X16:
- vp9_idct16x16_add(dqcoeff, dst, stride, eob);
- break;
- case TX_32X32:
- vp9_idct32x32_add(dqcoeff, dst, stride, eob);
- break;
- default:
- assert(0 && "Invalid transform size");
- return;
- }
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_32X32:
+ vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
}
}
-#else
+ } else {
if (xd->lossless) {
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
@@ -258,18 +235,40 @@
return;
}
}
+ }
+#else
+ if (xd->lossless) {
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;
+ }
+ }
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (eob == 1) {
- dqcoeff[0] = 0;
- } else {
- if (tx_size <= TX_16X16 && eob <= 10)
- memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
- else if (tx_size == TX_32X32 && eob <= 34)
- memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
- else
- memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
- }
+ if (eob == 1) {
+ dqcoeff[0] = 0;
+ } else {
+ if (tx_size <= TX_16X16 && eob <= 10)
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+ else
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
@@ -279,54 +278,31 @@
uint8_t *dst, int stride,
int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- if (eob > 0) {
- tran_low_t *const dqcoeff = pd->dqcoeff;
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
- } else {
- switch (tx_size) {
- case TX_4X4:
- vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_8X8:
- vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_16X16:
- vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
- break;
- case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
- break;
- default:
- assert(0 && "Invalid transform size");
- }
- }
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->lossless) {
+ vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
} else {
- if (xd->lossless) {
- vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
- } else {
- switch (tx_size) {
- case TX_4X4:
- vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
- break;
- case TX_8X8:
- vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
- break;
- case TX_16X16:
- vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
- break;
- case TX_32X32:
- vp9_idct32x32_add(dqcoeff, dst, stride, eob);
- break;
- default:
- assert(0 && "Invalid transform size");
- return;
- }
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_32X32:
+ vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
}
}
-#else
+ } else {
if (xd->lossless) {
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
@@ -348,18 +324,40 @@
return;
}
}
+ }
+#else
+ if (xd->lossless) {
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;
+ }
+ }
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (eob == 1) {
- dqcoeff[0] = 0;
- } else {
- if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
- else if (tx_size == TX_32X32 && eob <= 34)
- memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
- else
- memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
- }
+ if (eob == 1) {
+ dqcoeff[0] = 0;
+ } else {
+ if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+ else
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
@@ -389,8 +387,10 @@
&vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size,
r, mi->segment_id);
- inverse_transform_block_intra(xd, plane, tx_type, tx_size,
- dst, pd->dst.stride, eob);
+ if (eob > 0) {
+ inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+ dst, pd->dst.stride, eob);
+ }
}
}
@@ -402,9 +402,11 @@
const int eob = vp9_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
mi->segment_id);
- inverse_transform_block_inter(xd, plane, tx_size,
- &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
- pd->dst.stride, eob);
+ if (eob > 0) {
+ inverse_transform_block_inter(
+ xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+ pd->dst.stride, eob);
+ }
return eob;
}
@@ -859,7 +861,7 @@
VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
}
- vpx_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+ vp9_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
if (mi->skip) {
dec_reset_skip_context(xd);
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 8604420..596427c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -149,17 +149,12 @@
}
static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- int mi_row, int mi_col, vpx_reader *r) {
+ int mi_row, int mi_col, vpx_reader *r,
+ int x_mis, int y_mis) {
struct segmentation *const seg = &cm->seg;
MODE_INFO *const mi = xd->mi[0];
int predicted_segment_id, segment_id;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = xd->plane[0].n4_w >> 1;
- const int bh = xd->plane[0].n4_h >> 1;
-
- // TODO(slavarnway): move x_mis, y_mis into xd ?????
- const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
- const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
if (!seg->enabled)
return 0; // Default for disabled segmentation
@@ -202,19 +197,14 @@
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
- int mi_row, int mi_col, vpx_reader *r) {
+ int mi_row, int mi_col, vpx_reader *r,
+ int x_mis, int y_mis) {
MODE_INFO *const mi = xd->mi[0];
const MODE_INFO *above_mi = xd->above_mi;
const MODE_INFO *left_mi = xd->left_mi;
const BLOCK_SIZE bsize = mi->sb_type;
int i;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = xd->plane[0].n4_w >> 1;
- const int bh = xd->plane[0].n4_h >> 1;
-
- // TODO(slavarnway): move x_mis, y_mis into xd ?????
- const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
- const int y_mis = VPXMIN(cm->mi_rows - mi_row, bh);
mi->segment_id = read_intra_segment_id(cm, mi_offset, x_mis, y_mis, r);
mi->skip = read_skip(cm, xd, mi->segment_id, r);
@@ -473,14 +463,13 @@
}
}
-static void dec_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist,
- int_mv *best_mv, int refmv_count) {
+static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv,
+ int refmv_count) {
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < refmv_count; ++i) {
lower_mv_precision(&mvlist[i].as_mv, allow_hp);
- clamp_mv2(&mvlist[i].as_mv, xd);
*best_mv = mvlist[i];
}
}
@@ -788,7 +777,7 @@
tmp_mvs, mi_row, mi_col, -1, 0,
fpm_sync, (void *)pbi);
- dec_find_best_ref_mvs(xd, allow_hp, tmp_mvs, &best_ref_mvs[ref],
+ dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref],
refmv_count);
}
}
@@ -839,12 +828,14 @@
static void read_inter_frame_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
- int mi_row, int mi_col, vpx_reader *r) {
+ int mi_row, int mi_col, vpx_reader *r,
+ int x_mis, int y_mis) {
VP9_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
int inter_block;
- mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
+ mi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r, x_mis,
+ y_mis);
mi->skip = read_skip(cm, xd, mi->segment_id, r);
inter_block = read_is_inter_block(cm, xd, mi->segment_id, r);
mi->tx_size = read_tx_size(cm, xd, !mi->skip || !inter_block, r);
@@ -860,7 +851,7 @@
memcpy(dst, src, sizeof(*dst) * 2);
}
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
int mi_row, int mi_col, vpx_reader *r,
int x_mis, int y_mis) {
VP9_COMMON *const cm = &pbi->common;
@@ -869,9 +860,9 @@
int w, h;
if (frame_is_intra_only(cm)) {
- read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+ read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r, x_mis, y_mis);
} else {
- read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+ read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
for (h = 0; h < y_mis; ++h) {
for (w = 0; w < x_mis; ++w) {
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index 75f568c..45569ec 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -19,7 +19,7 @@
extern "C" {
#endif
-void vpx_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
int mi_row, int mi_col, vpx_reader *r,
int x_mis, int y_mis);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index ac834ca..b27ce6a 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -192,11 +192,16 @@
p[2].src.buf,
p[0].src.stride,
p[1].src.stride,
- bsize);
+ bsize,
+ 0,
+ 0);
if (is_skin)
refresh_this_block = 1;
}
+ if (cpi->oxcf.rc_mode == VPX_VBR && mi->ref_frame[0] == GOLDEN_FRAME)
+ refresh_this_block = 0;
+
// If this block is labeled for refresh, check if we should reset the
// segment_id.
if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
@@ -304,6 +309,8 @@
rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40);
else
rc->baseline_gf_interval = 40;
+ if (cpi->oxcf.rc_mode == VPX_VBR)
+ rc->baseline_gf_interval = 20;
}
// Update some encoding stats (from the just encoded frame). If this frame's
@@ -316,42 +323,40 @@
int mi_row, mi_col;
double fraction_low = 0.0;
int low_content_frame = 0;
-
MODE_INFO **mi = cm->mi_grid_visible;
RATE_CONTROL *const rc = &cpi->rc;
const int rows = cm->mi_rows, cols = cm->mi_cols;
int cnt1 = 0, cnt2 = 0;
int force_gf_refresh = 0;
-
+ int flag_force_gf_high_motion = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
for (mi_col = 0; mi_col < cols; mi_col++) {
- int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
- mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
- int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
- mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
-
- // Calculate the motion of the background.
- if (abs_mvr <= 16 && abs_mvc <= 16) {
- cnt1++;
- if (abs_mvr == 0 && abs_mvc == 0)
- cnt2++;
+ if (flag_force_gf_high_motion == 1) {
+ int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 ?
+ mi[0]->mv[0].as_mv.row : -1 * mi[0]->mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 ?
+ mi[0]->mv[0].as_mv.col : -1 * mi[0]->mv[0].as_mv.col;
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0)
+ cnt2++;
+ }
}
mi++;
-
// Accumulate low_content_frame.
if (cr->map[mi_row * cols + mi_col] < 1)
low_content_frame++;
}
mi += 8;
}
-
// For video conference clips, if the background has high motion in current
// frame because of the camera movement, set this frame as the golden frame.
// Use 70% and 5% as the thresholds for golden frame refreshing.
// Also, force this frame as a golden update frame if this frame will change
// the resolution (resize_pending != 0).
if (cpi->resize_pending != 0 ||
- (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
+ (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1)) {
vp9_cyclic_refresh_set_golden_update(cpi);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -360,7 +365,6 @@
cpi->refresh_golden_frame = 1;
force_gf_refresh = 1;
}
-
fraction_low =
(double)low_content_frame / (rows * cols);
// Update average.
@@ -503,6 +507,18 @@
cr->motion_thresh = 4;
cr->rate_boost_fac = 12;
}
+ if (cpi->oxcf.rc_mode == VPX_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+ // For now use smaller qp-delta (than CBR), no second boosted seg, and
+ // turn-off (no refresh) on golden refresh (since it's already boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
+ cr->rate_boost_fac = 10;
+ if (cpi->refresh_golden_frame == 1) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
+ }
}
// Setup cyclic background refresh: set delta q and segmentation map.
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index e419cff..9bc9f26 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -338,7 +338,9 @@
mb->plane[2].src.buf,
mb->plane[0].src.stride,
mb->plane[1].src.stride,
- bs);
+ bs,
+ 0,
+ 0);
}
mv_col = ctx->best_sse_mv.as_mv.col;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index cf1fe81..9076b31 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -243,14 +243,16 @@
static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
- const int block_width = num_8x8_blocks_wide_lookup[bsize];
- const int block_height = num_8x8_blocks_high_lookup[bsize];
+ const int block_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+ cm->mi_cols - mi_col);
+ const int block_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+ cm->mi_rows - mi_row);
+ const int mi_stride = xd->mi_stride;
+ MODE_INFO *const src_mi = xd->mi[0];
int i, j;
for (j = 0; j < block_height; ++j)
- for (i = 0; i < block_width; ++i) {
- if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
- xd->mi[j * xd->mi_stride + i] = xd->mi[0];
- }
+ for (i = 0; i < block_width; ++i)
+ xd->mi[j * mi_stride + i] = src_mi;
}
static void set_block_size(VP9_COMP * const cpi,
@@ -691,21 +693,17 @@
const int use_4x4_partition = cm->frame_type == KEY_FRAME;
const int low_res = (cm->width <= 352 && cm->height <= 288);
int variance4x4downsample[16];
+ int segment_id;
- int segment_id = CR_SEGMENT_ID_BASE;
+ set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+ segment_id = xd->mi[0]->segment_id;
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
- const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
- cm->last_frame_seg_map;
- segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-
if (cyclic_refresh_segment_id_boosted(segment_id)) {
int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
set_vbp_thresholds(cpi, thresholds, q);
}
}
- set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
-
if (xd->mb_to_right_edge < 0)
pixels_wide += (xd->mb_to_right_edge >> 3);
if (xd->mb_to_bottom_edge < 0)
@@ -770,37 +768,59 @@
x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
}
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
// Check if most of the superblock is skin content, and if so, force split
- // to 32x32. Avoid checking superblocks on/near boundary and avoid low
- // resolutons for now.
+ // to 32x32, and set x->sb_is_skin for use in mode selection.
+ // Avoid checking superblocks on/near boundary and avoid low resolutions.
// Note superblock may still pick 64X64 if y_sad is very small
// (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
x->sb_is_skin = 0;
#if !CONFIG_VP9_HIGHBITDEPTH
if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bl_index1, bl_index2, bl_index3;
int num_16x16_skin = 0;
int num_16x16_nonskin = 0;
+ int is_skin = 0;
+ int consec_zeromv = 0;
uint8_t *ysignal = x->plane[0].src.buf;
uint8_t *usignal = x->plane[1].src.buf;
uint8_t *vsignal = x->plane[2].src.buf;
int spuv = x->plane[1].src.stride;
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- int is_skin = vp9_compute_skin_block(ysignal,
- usignal,
- vsignal,
- sp,
- spuv,
- BLOCK_16X16);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ // Loop through the 16x16 sub-blocks.
+ int j, i;
+ for (i = 0; i < ymis; i+=2) {
+ for (j = 0; j < xmis; j+=2) {
+ int bl_index = block_index + i * cm->mi_cols + j;
+ bl_index1 = bl_index + 1;
+ bl_index2 = bl_index + cm->mi_cols;
+ bl_index3 = bl_index2 + 1;
+ consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+ VPXMIN(cr->consec_zero_mv[bl_index1],
+ VPXMIN(cr->consec_zero_mv[bl_index2],
+ cr->consec_zero_mv[bl_index3])));
+ is_skin = vp9_compute_skin_block(ysignal,
+ usignal,
+ vsignal,
+ sp,
+ spuv,
+ BLOCK_16X16,
+ consec_zeromv,
+ 0);
num_16x16_skin += is_skin;
num_16x16_nonskin += (1 - is_skin);
if (num_16x16_nonskin > 3) {
// Exit loop if at least 4 of the 16x16 blocks are not skin.
- i = 4;
- j = 4;
+ i = ymis;
+ j = xmis;
}
ysignal += 16;
usignal += 8;
@@ -2439,7 +2459,8 @@
PARTITION_CONTEXT sl[8], sa[8];
TOKENEXTRA *tp_orig = *tp;
PICK_MODE_CONTEXT *ctx = &pc_tree->none;
- int i, pl;
+ int i;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
BLOCK_SIZE subsize;
RD_COST this_rdc, sum_rdc, best_rdc;
int do_split = bsize >= BLOCK_8X8;
@@ -2587,7 +2608,6 @@
&this_rdc, bsize, ctx, best_rdc.rdcost);
if (this_rdc.rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
this_rdc.rate, this_rdc.dist);
@@ -2706,7 +2726,6 @@
}
if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
sum_rdc.rate, sum_rdc.dist);
@@ -2772,7 +2791,6 @@
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
if (sum_rdc.rdcost < best_rdc.rdcost) {
@@ -2824,7 +2842,6 @@
}
if (sum_rdc.rdcost < best_rdc.rdcost) {
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
sum_rdc.rate, sum_rdc.dist);
@@ -4271,13 +4288,9 @@
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO **mi_8x8 = xd->mi;
- MODE_INFO *mi = mi_8x8[0];
+ MODE_INFO *mi = xd->mi[0];
const int seg_skip = segfeature_active(&cm->seg, mi->segment_id,
SEG_LVL_SKIP);
- const int mis = cm->mi_stride;
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
@@ -4333,20 +4346,14 @@
++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
&td->counts->tx)[mi->tx_size];
} else {
- int x, y;
- TX_SIZE tx_size;
// The new intra coding scheme requires no change of transform size
if (is_inter_block(mi)) {
- tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
- max_txsize_lookup[bsize]);
+ mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+ max_txsize_lookup[bsize]);
} else {
- tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
+ mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
}
- for (y = 0; y < mi_height; y++)
- for (x = 0; x < mi_width; x++)
- if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
- mi_8x8[mis * y + x]->tx_size = tx_size;
}
++td->counts->tx.tx_totals[mi->tx_size];
++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 8f4d80c..71f27cc 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -75,11 +75,12 @@
static void build_nmv_component_cost_table(int *mvcost,
const nmv_component* const mvcomp,
int usehp) {
- int i, v;
int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
int bits_cost[MV_OFFSET_BITS][2];
int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
int class0_hp_cost[2], hp_cost[2];
+ int i;
+ int c, o;
sign_cost[0] = vp9_cost_zero(mvcomp->sign);
sign_cost[1] = vp9_cost_one(mvcomp->sign);
@@ -94,44 +95,56 @@
vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
- if (usehp) {
- class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
- class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
- hp_cost[0] = vp9_cost_zero(mvcomp->hp);
- hp_cost[1] = vp9_cost_one(mvcomp->hp);
- }
+ // Always build the hp costs to avoid an uninitialized warning from gcc
+ class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+ class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+ hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+ hp_cost[1] = vp9_cost_one(mvcomp->hp);
+
mvcost[0] = 0;
- for (v = 1; v <= MV_MAX; ++v) {
- int z, c, o, d, e, f, cost = 0;
- z = v - 1;
- c = vp9_get_mv_class(z, &o);
- cost += class_cost[c];
+ // MV_CLASS_0
+ for (o = 0; o < (CLASS0_SIZE << 3); ++o) {
+ int d, e, f;
+ int cost = class_cost[MV_CLASS_0];
+ int v = o + 1;
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
- e = (o & 1); /* high precision mv data */
- if (c == MV_CLASS_0) {
- cost += class0_cost[d];
- } else {
- int i, b;
- b = c + CLASS0_BITS - 1; /* number of bits */
- for (i = 0; i < b; ++i)
- cost += bits_cost[i][((d >> i) & 1)];
- }
- if (c == MV_CLASS_0) {
- cost += class0_fp_cost[d][f];
- } else {
- cost += fp_cost[f];
- }
+ cost += class0_cost[d];
+ cost += class0_fp_cost[d][f];
if (usehp) {
- if (c == MV_CLASS_0) {
- cost += class0_hp_cost[e];
- } else {
- cost += hp_cost[e];
- }
+ e = (o & 1); /* high precision mv data */
+ cost += class0_hp_cost[e];
}
mvcost[v] = cost + sign_cost[0];
mvcost[-v] = cost + sign_cost[1];
}
+ for (c = MV_CLASS_1; c < MV_CLASSES; ++c) {
+ int d;
+ for (d = 0; d < (1 << c); ++d) {
+ int f;
+ int whole_cost = class_cost[c];
+ int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i)
+ whole_cost += bits_cost[i][((d >> i) & 1)];
+ for (f = 0; f < 4; ++f) {
+ int cost = whole_cost + fp_cost[f];
+ int v = (CLASS0_SIZE << (c + 2)) + d * 8 + f * 2 /* + e */ + 1;
+ if (usehp) {
+ mvcost[v] = cost + hp_cost[0] + sign_cost[0];
+ mvcost[-v] = cost + hp_cost[0] + sign_cost[1];
+ if (v + 1 > MV_MAX) break;
+ mvcost[v + 1] = cost + hp_cost[1] + sign_cost[0];
+ mvcost[-v - 1] = cost + hp_cost[1] + sign_cost[1];
+ } else {
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ if (v + 1 > MV_MAX) break;
+ mvcost[v + 1] = cost + sign_cost[0];
+ mvcost[-v - 1] = cost + sign_cost[1];
+ }
+ }
+ }
+ }
}
static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index e8a8b89..01855ea 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -410,6 +410,9 @@
memset(&cpi->svc.scaled_frames[0], 0,
MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+ vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+ memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
@@ -2451,6 +2454,13 @@
return scale;
}
+static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ return (rc->projected_frame_size > ((high_limit * 3) / 2)) ||
+ (rc->projected_frame_size < (low_limit / 2));
+}
+
// Function to test for conditions that indicate we should loop
// back and recode a frame.
static int recode_loop_test(VP9_COMP *cpi,
@@ -2462,6 +2472,7 @@
int force_recode = 0;
if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ big_rate_miss(cpi, high_limit, low_limit) ||
(cpi->sf.recode_loop == ALLOW_RECODE) ||
(frame_is_kfgfarf &&
(cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
@@ -2808,9 +2819,38 @@
vpx_clear_system_state();
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ recon_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ } else {
+ recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0) {
+ double dc_quant_devisor;
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ dc_quant_devisor = 4.0;
+ break;
+ case VPX_BITS_10:
+ dc_quant_devisor = 16.0;
+ break;
+ case VPX_BITS_12:
+ dc_quant_devisor = 64.0;
+ break;
+ default:
+ assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+ break;
+ }
+#else
+ dc_quant_devisor = 4.0;
+#endif
+
fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
@@ -2836,7 +2876,8 @@
(cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
cpi->rc.total_actual_bits, cm->base_qindex,
vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
- (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) / 4.0,
+ (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) /
+ dc_quant_devisor,
vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
cm->bit_depth),
cpi->rc.avg_q,
@@ -2851,7 +2892,7 @@
cpi->twopass.kf_zeromotion_pct,
cpi->twopass.fr_content_type,
cm->lf.filter_level);
-
+ }
fclose(f);
if (0) {
@@ -3089,17 +3130,30 @@
vpx_clear_system_state();
set_frame_size(cpi);
- cpi->Source = vp9_scale_if_required(cm,
- cpi->un_scaled_source,
- &cpi->scaled_source,
- (cpi->oxcf.pass == 0));
+ if (is_one_pass_cbr_svc(cpi) &&
+ cpi->un_scaled_source->y_width == cm->width << 2 &&
+ cpi->un_scaled_source->y_height == cm->height << 2 &&
+ cpi->svc.scaled_temp.y_width == cm->width << 1 &&
+ cpi->svc.scaled_temp.y_height == cm->height << 1) {
+ cpi->Source = vp9_svc_twostage_scale(cm,
+ cpi->un_scaled_source,
+ &cpi->scaled_source,
+ &cpi->svc.scaled_temp);
+ } else {
+ cpi->Source = vp9_scale_if_required(cm,
+ cpi->un_scaled_source,
+ &cpi->scaled_source,
+ (cpi->oxcf.pass == 0));
+ }
// Avoid scaling last_source unless its needed.
- // Last source is currently only used for screen-content mode,
- // if partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
+ // Last source is needed if vp9_avg_source_sad() is used, or if
+ // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
// estimation is enabled.
if (cpi->unscaled_last_source != NULL &&
(cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) ||
cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
cpi->noise_estimate.enabled))
cpi->Last_Source = vp9_scale_if_required(cm,
@@ -3109,18 +3163,18 @@
vp9_update_noise_estimate(cpi);
if (cpi->oxcf.pass == 0 &&
- cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.speed >= 5 &&
cpi->resize_state == 0 &&
cm->frame_type != KEY_FRAME &&
- cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ cpi->oxcf.rc_mode == VPX_VBR))
vp9_avg_source_sad(cpi);
- // TODO(wonkap/marpan): For 1 pass SVC, since only ZERMOV is allowed for
- // upsampled reference frame (i.e, svc->force_zero_mode_spatial_ref = 0),
- // we should be able to avoid this frame-level upsampling.
- // Keeping it for now as there is an asan error in the multi-threaded SVC
- // rate control test if this upsampling is removed.
- if (frame_is_intra_only(cm) == 0) {
+ // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference
+ // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this
+ // frame-level upsampling.
+ if (frame_is_intra_only(cm) == 0 && !is_one_pass_cbr_svc(cpi)) {
vp9_scale_references(cpi);
}
@@ -3510,6 +3564,25 @@
}
}
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth);
+#else
+ vp9_scale_and_extend_frame(unscaled, scaled_temp);
+ vp9_scale_and_extend_frame(scaled_temp, scaled);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
YV12_BUFFER_CONFIG *unscaled,
YV12_BUFFER_CONFIG *scaled,
@@ -3680,6 +3753,12 @@
++cm->current_video_frame;
cpi->ext_refresh_frame_flags_pending = 0;
cpi->svc.rc_drop_superframe = 1;
+ // TODO(marpan): Advancing the svc counters on dropped frames can break
+ // the referencing scheme for the fixed svc patterns defined in
+ // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but
+ // for now, don't advance the svc frame counters on dropped frame.
+ // if (cpi->use_svc)
+ // vp9_inc_frame_in_layer(cpi);
return;
}
}
@@ -4106,6 +4185,20 @@
arf_src_index = 0;
if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
assert(arf_src_index <= rc->frames_to_key);
if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 017fa61..02d223a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -614,6 +614,11 @@
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(VP9_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp);
+
YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
YV12_BUFFER_CONFIG *unscaled,
YV12_BUFFER_CONFIG *scaled,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 9d3b154..10fd6c0 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -490,7 +490,35 @@
cpi->rc.frames_to_key = INT_MAX;
}
+// This threshold is used to track blocks where to all intents and purposes
+// the intra prediction error is 0. Though the metric we test against
+// is technically an sse we are mainly interested in blocks where all the pixels
+// in the 8 bit domain have an error of <= 1 (where error = sse) so a
+// linear scaling for 10 and 12 bit gives similar results.
#define UL_INTRA_THRESH 50
+#if CONFIG_VP9_HIGHBITDEPTH
+static int get_ul_intra_threshold(VP9_COMMON *cm) {
+ int ret_val = UL_INTRA_THRESH;
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ ret_val = UL_INTRA_THRESH;
+ break;
+ case VPX_BITS_10:
+ ret_val = UL_INTRA_THRESH >> 2;
+ break;
+ case VPX_BITS_12:
+ ret_val = UL_INTRA_THRESH >> 4;
+ break;
+ default:
+ assert(0 && "cm->bit_depth should be VPX_BITS_8, "
+ "VPX_BITS_10 or VPX_BITS_12");
+ }
+ }
+ return ret_val;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#define INVALID_ROW -1
void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
@@ -681,7 +709,11 @@
// domain). In natural videos this is uncommon, but it is much more
// common in animations, graphics and screen content, so may be used
// as a signal to detect these types of content.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (this_error < get_ul_intra_threshold(cm)) {
+#else
if (this_error < UL_INTRA_THRESH) {
+#endif
++intra_skip_count;
} else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
image_data_start_row = mb_row;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 8b7825e..4669145 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -383,6 +383,51 @@
(cost_list[4] - 2 * cost_list[0] + cost_list[2]));
}
+int vp9_skip_sub_pixel_tree(
+ const MACROBLOCK *x,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *cost_list,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
+ (void) halfiters;
+ (void) quarteriters;
+ (void) eighthiters;
+ (void) whichdir;
+ (void) allow_hp;
+ (void) forced_stop;
+ (void) hstep;
+ (void) rr;
+ (void) rc;
+ (void) minr;
+ (void) minc;
+ (void) maxr;
+ (void) maxc;
+ (void) tr;
+ (void) tc;
+ (void) sse;
+ (void) thismse;
+ (void) cost_list;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
int vp9_find_best_sub_pixel_tree_pruned_evenmore(
const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 1c101f2..1b0c860 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -92,6 +92,7 @@
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
const MV *ref_mv, int sad_per_bit,
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index e56cc9b..d505629 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -173,12 +173,18 @@
// been encoded as zero/low motion x (= thresh_consec_zeromv) frames
// in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
// 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+ int consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+ VPXMIN(cr->consec_zero_mv[bl_index1],
+ VPXMIN(cr->consec_zero_mv[bl_index2],
+ cr->consec_zero_mv[bl_index3])));
int is_skin = vp9_compute_skin_block(src_y,
src_u,
src_v,
src_ystride,
src_uvstride,
- bsize);
+ bsize,
+ consec_zeromv,
+ 0);
if (frame_low_motion &&
cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d861f80..3ea2ccd 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1279,14 +1279,21 @@
usable_ref_frame = GOLDEN_FRAME;
}
- // If the reference is temporally aligned with current superframe
- // (e.g., spatial reference within superframe), constrain the inter mode:
- // for now only test zero motion.
- if (cpi->use_svc && svc ->force_zero_mode_spatial_ref) {
- if (svc->ref_frame_index[cpi->lst_fb_idx] == svc->current_superframe)
- svc_force_zero_mode[LAST_FRAME - 1] = 1;
- if (svc->ref_frame_index[cpi->gld_fb_idx] == svc->current_superframe)
- svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+ // For svc mode, on spatial_layer_id > 0: if the reference has different scale
+ // constrain the inter mode to only test zero motion.
+ if (cpi->use_svc &&
+ svc ->force_zero_mode_spatial_ref &&
+ cpi->svc.spatial_layer_id > 0) {
+ if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
+ struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+ if (vp9_is_scaled(sf))
+ svc_force_zero_mode[LAST_FRAME - 1] = 1;
+ }
+ if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
+ struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+ if (vp9_is_scaled(sf))
+ svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+ }
}
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
@@ -1356,7 +1363,9 @@
continue;
if (this_mode == NEWMV) {
- if (ref_frame > LAST_FRAME && !cpi->use_svc) {
+ if (ref_frame > LAST_FRAME &&
+ !cpi->use_svc &&
+ cpi->oxcf.rc_mode == VPX_CBR) {
int tmp_sad;
int dis, cost_list[5];
@@ -1591,7 +1600,8 @@
this_rdc.rate += ref_frame_cost[ref_frame];
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
- if (cpi->oxcf.speed >= 5 &&
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.speed >= 5 &&
cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
!x->sb_is_skin) {
// Bias against non-zero (above some threshold) motion for large blocks.
@@ -1679,12 +1689,15 @@
xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
x->skip_txfm[0] = best_mode_skip_txfm;
- // Perform intra prediction only if base layer is chosen as the reference.
+ // For spatial enhancement layer: perform intra prediction only if base
+ // layer is chosen as the reference. Always perform intra prediction if
+ // LAST is the only reference or is_key_frame is set.
if (cpi->svc.spatial_layer_id) {
perform_intra_pred =
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
+ !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
(!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame
- && svc_force_zero_mode[best_ref_frame]);
+ && svc_force_zero_mode[best_ref_frame - 1]);
inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
}
// Perform intra prediction search, if the best SAD is above a certain
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 5df2909..61bb35e 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1469,7 +1469,12 @@
cm->frame_type = INTER_FRAME;
}
if (rc->frames_till_gf_update_due == 0) {
- rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ } else {
+ rc->baseline_gf_interval =
+ (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ }
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// NOTE: frames_till_gf_update_due must be <= frames_to_key.
if (rc->frames_till_gf_update_due > rc->frames_to_key) {
@@ -1487,6 +1492,8 @@
else
target = calc_pframe_target_size_one_pass_vbr(cpi);
vp9_rc_set_frame_target(cpi, target);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
+ vp9_cyclic_refresh_update_parameters(cpi);
}
static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
@@ -1567,40 +1574,28 @@
return vp9_rc_clamp_iframe_target_size(cpi, target);
}
-// Reset information needed to set proper reference frames and buffer updates
-// for temporal layering. This is called when a key frame is encoded.
-static void reset_temporal_layer_to_zero(VP9_COMP *cpi) {
- int sl;
- LAYER_CONTEXT *lc = NULL;
- cpi->svc.temporal_layer_id = 0;
-
- for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
- lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers];
- lc->current_video_frame_in_layer = 0;
- lc->frames_from_key_frame = 0;
- }
-}
-
void vp9_rc_get_svc_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target = rc->avg_frame_bandwidth;
int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
-
+ // Periodic key frames are based on the super-frame counter
+ // (svc.current_superframe); also only the base spatial layer is a key frame.
if ((cm->current_video_frame == 0) ||
(cpi->frame_flags & FRAMEFLAGS_KEY) ||
- (cpi->oxcf.auto_key && (rc->frames_since_key %
- cpi->oxcf.key_freq == 0))) {
+ (cpi->oxcf.auto_key &&
+ (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) &&
+ cpi->svc.spatial_layer_id == 0)) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
-
if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[layer].is_key_frame = 1;
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
} else if (is_one_pass_cbr_svc(cpi)) {
- reset_temporal_layer_to_zero(cpi);
+ if (cm->current_video_frame > 0)
+ vp9_svc_reset_key_frame(cpi);
layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers);
cpi->svc.layer_context[layer].is_key_frame = 1;
@@ -2010,13 +2005,17 @@
VP9_COMMON * const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
rc->high_source_sad = 0;
- if (cpi->Last_Source != NULL) {
+ if (cpi->Last_Source != NULL &&
+ cpi->Last_Source->y_width == cpi->Source->y_width &&
+ cpi->Last_Source->y_height == cpi->Source->y_height) {
const uint8_t *src_y = cpi->Source->y_buffer;
const int src_ystride = cpi->Source->y_stride;
const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
const int last_src_ystride = cpi->Last_Source->y_stride;
int sbi_row, sbi_col;
const BLOCK_SIZE bsize = BLOCK_64X64;
+ uint32_t min_thresh = 4000;
+ float thresh = 8.0f;
// Loop over sub-sample of frame, and compute average sad over 64x64 blocks.
uint64_t avg_sad = 0;
int num_samples = 0;
@@ -2047,12 +2046,32 @@
// between current and the previous frame value(s). Use a minimum threshold
// for cases where there is small change from content that is completely
// static.
- if (avg_sad > VPXMAX(4000, (rc->avg_source_sad << 3)) &&
+ if (cpi->oxcf.rc_mode == VPX_VBR) {
+ min_thresh = 30000;
+ thresh = 2.0f;
+ }
+ if (avg_sad >
+ VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
rc->frames_since_key > 1)
rc->high_source_sad = 1;
else
rc->high_source_sad = 0;
- rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+ if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
+ rc->avg_source_sad = (rc->avg_source_sad + avg_sad) >> 1;
+ // For VBR, under scene change/high content change, force golden refresh.
+ if (cpi->oxcf.rc_mode == VPX_VBR &&
+ rc->high_source_sad &&
+ cpi->refresh_golden_frame == 0 &&
+ cpi->ext_refresh_frame_flags_pending == 0) {
+ int target;
+ cpi->refresh_golden_frame = 1;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ target = calc_pframe_target_size_one_pass_vbr(cpi);
+ vp9_rc_set_frame_target(cpi, target);
+ }
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 193c9d3..508c596 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -387,47 +387,70 @@
cost = token_costs[0][0][pt][EOB_TOKEN];
c = 0;
} else {
- int band_left = *band_count++;
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
- // dc token
- int v = qcoeff[0];
- int16_t prev_t;
- EXTRABIT e;
- vp9_get_token_extra(v, &prev_t, &e);
- cost = (*token_costs)[0][pt][prev_t] +
- vp9_get_cost(prev_t, e, cat6_high_cost);
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
+ cost += (*token_costs)[0][pt][prev_t];
- token_cache[0] = vp9_pt_energy_class[prev_t];
- ++token_costs;
+ token_cache[0] = vp9_pt_energy_class[prev_t];
+ ++token_costs;
- // ac tokens
- for (c = 1; c < eob; c++) {
- const int rc = scan[c];
- int16_t t;
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
- v = qcoeff[rc];
- vp9_get_token_extra(v, &t, &e);
- if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] +
- vp9_get_cost(t, e, cat6_high_cost);
- } else {
- pt = get_coef_context(nb, token_cache, c);
- cost += (*token_costs)[!prev_t][pt][t] +
- vp9_get_cost(t, e, cat6_high_cost);
- token_cache[rc] = vp9_pt_energy_class[t];
+ v = qcoeff[rc];
+ cost += vp9_get_token_cost(v, &t, cat6_high_cost);
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+ prev_t = t;
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
}
- prev_t = t;
- if (!--band_left) {
- band_left = *band_count++;
- ++token_costs;
- }
- }
- // eob token
- if (band_left) {
- if (use_fast_coef_costing) {
+ // eob token
+ if (band_left)
cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
- } else {
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+ unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+ cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
+ cost += (*token_costs)[0][pt][tok];
+
+ token_cache[0] = vp9_pt_energy_class[tok];
+ ++token_costs;
+
+ tok_cost_ptr = &((*token_costs)[!tok]);
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*tok_cost_ptr)[pt][tok];
+ token_cache[rc] = vp9_pt_energy_class[tok];
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ tok_cost_ptr = &((*token_costs)[!tok]);
+ }
+
+ // eob token
+ if (band_left) {
pt = get_coef_context(nb, token_cache, c);
cost += (*token_costs)[0][pt][EOB_TOKEN];
}
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index 8e117eb..ff0dfce 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -15,7 +15,7 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_skin_detection.h"
-#define MODEL_MODE 0
+#define MODEL_MODE 1
// Fixed-point skin color model parameters.
static const int skin_mean[5][2] = {
@@ -48,7 +48,8 @@
return skin_diff;
}
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+ int motion) {
if (y < y_low || y > y_high) {
return 0;
} else {
@@ -62,16 +63,19 @@
// Exit on very strong cb.
if (cb > 150 && cr < 110)
return 0;
- // Exit on (another) low luminance threshold if either color is high.
- if (y < 50 && (cb > 140 || cr > 140))
- return 0;
for (; i < 5; i++) {
- if (evaluate_skin_color_difference(cb, cr, i) < skin_threshold[i + 1]) {
- return 1;
+ int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+ if (skin_color_diff < skin_threshold[i + 1]) {
+ if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2))
+ return 0;
+ else if (motion == 0 &&
+ skin_color_diff > (skin_threshold[i + 1] >> 1))
+ return 0;
+ else
+ return 1;
}
// Exit if difference is much large than the threshold.
- if (evaluate_skin_color_difference(cb, cr, i) >
- (skin_threshold[i + 1] << 3)) {
+ if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
return 0;
}
}
@@ -81,16 +85,25 @@
}
int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
- int stride, int strideuv, int bsize) {
- // Take center pixel in block to determine is_skin.
- const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
- const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
- const int uv_width_shift = y_width_shift >> 1;
- const int uv_height_shift = y_height_shift >> 1;
- const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
- const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
- const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
- return vp9_skin_pixel(ysource, usource, vsource);
+ int stride, int strideuv, int bsize,
+ int consec_zeromv, int curr_motion_magn) {
+ // No skin if block has been zero/small motion for long consecutive time.
+ if (consec_zeromv > 80 && curr_motion_magn == 0) {
+ return 0;
+ } else {
+ int motion = 1;
+ // Take center pixel in block to determine is_skin.
+ const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+ const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+ const int uv_width_shift = y_width_shift >> 1;
+ const int uv_height_shift = y_height_shift >> 1;
+ const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
+ const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
+ const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+ if (consec_zeromv > 30 && curr_motion_magn == 0)
+ motion = 0;
+ return vp9_skin_pixel(ysource, usource, vsource, motion);
+ }
}
@@ -99,6 +112,7 @@
void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
int i, j, mi_row, mi_col, num_bl;
VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
uint8_t *y;
const uint8_t *src_y = cpi->Source->y_buffer;
const uint8_t *src_u = cpi->Source->u_buffer;
@@ -113,7 +127,7 @@
int shuv = shy - 1;
int fac = y_bsize / 8;
// Use center pixel or average of center 2x2 pixels.
- int mode_filter = 1;
+ int mode_filter = 0;
YV12_BUFFER_CONFIG skinmap;
memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height,
@@ -130,27 +144,48 @@
for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
num_bl = 0;
for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
- // Select pixel for each block for skin detection.
- // Use center pixel, or 2x2 average at center.
- uint8_t ysource = src_y[ypos * src_ystride + ypos];
- uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
- uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
- uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
- uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
- uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
- uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
- uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)];
- uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)];
- uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
- uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
- uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
int is_skin = 0;
if (mode_filter == 1) {
+ // Use 2x2 average at center.
+ uint8_t ysource = src_y[ypos * src_ystride + ypos];
+ uint8_t usource = src_u[uvpos * src_uvstride + uvpos];
+ uint8_t vsource = src_v[uvpos * src_uvstride + uvpos];
+ uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos];
+ uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos];
+ uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos];
+ uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)];
+ uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)];
+ uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)];
+ uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
+ uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
+ uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
usource = (usource + usource2 + usource3 + usource4) >> 2;
vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
+ is_skin = vp9_skin_pixel(ysource, usource, vsource, 1);
+ } else {
+ int block_size = BLOCK_8X8;
+ int consec_zeromv = 0;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ int bl_index = mi_row * cm->mi_cols + mi_col;
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + cm->mi_cols;
+ int bl_index3 = bl_index2 + 1;
+ if (y_bsize == 8)
+ consec_zeromv = cr->consec_zero_mv[bl_index];
+ else
+ consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index],
+ VPXMIN(cr->consec_zero_mv[bl_index1],
+ VPXMIN(cr->consec_zero_mv[bl_index2],
+ cr->consec_zero_mv[bl_index3])));
+ }
+ if (y_bsize == 16)
+ block_size = BLOCK_16X16;
+ is_skin = vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+ src_uvstride, block_size,
+ consec_zeromv,
+ 0);
}
- is_skin = vp9_skin_pixel(ysource, usource, vsource);
for (i = 0; i < y_bsize; i++) {
for (j = 0; j < y_bsize; j++) {
if (is_skin)
diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 73f7c39..c77382d 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h
@@ -21,10 +21,12 @@
// #define OUTPUT_YUV_SKINMAP
-int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr,
+ int motion);
int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
- int stride, int strideuv, int bsize);
+ int stride, int strideuv, int bsize,
+ int consec_zeromv, int curr_motion_magn);
#ifdef OUTPUT_YUV_SKINMAP
// For viewing skin map on input source.
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index f684507..02be3c3 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -400,6 +400,8 @@
sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
sf->simple_model_rd_from_var = 1;
+ if (cpi->oxcf.rc_mode == VPX_VBR)
+ sf->mv.search_method = NSTEP;
if (!is_keyframe) {
int i;
@@ -441,7 +443,7 @@
}
if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
- sf->mv.subpel_force_stop = 2;
+ sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
}
}
@@ -607,7 +609,10 @@
sf->optimize_coefficients = 0;
}
- if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ if (sf->mv.subpel_force_stop == 3) {
+ // Whole pel only
+ cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
} else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index fa2f79d..90b3216 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -188,7 +188,11 @@
// Maximum number of steps in logarithmic subpel search before giving up.
int subpel_iters_per_step;
- // Control when to stop subpel search
+ // Control when to stop subpel search:
+ // 0: Full subpel search.
+ // 1: Stop at quarter pixel.
+ // 2: Stop at half pixel.
+ // 3: Stop at full pixel.
int subpel_force_stop;
// This variable sets the step_param used in full pel motion search.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 1d56154..73048f8 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -43,6 +43,26 @@
cpi->svc.ext_alt_fb_idx[sl] = 2;
}
+ // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
+ // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
+ // target of 1/4x1/4.
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) {
+ if (vpx_realloc_frame_buffer(&cpi->svc.scaled_temp,
+ cpi->common.width >> 1,
+ cpi->common.height >> 1,
+ cpi->common.subsampling_x,
+ cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cpi->common.use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ cpi->common.byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled_frame for svc ");
+ }
+
+
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img,
SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
@@ -796,3 +816,27 @@
}
}
}
+
+// Reset on key frame: reset counters, references and buffer updates.
+void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
+ int sl, tl;
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *lc = NULL;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+ lc->current_video_frame_in_layer = 0;
+ lc->frames_from_key_frame = 0;
+ }
+ }
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+ set_flags_and_fb_idx_for_temporal_mode3(cpi);
+ } else if (svc->temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+ set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+ } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+ set_flags_and_fb_idx_for_temporal_mode2(cpi);
+ }
+ vp9_update_temporal_layer_framerate(cpi);
+ vp9_restore_layer_context(cpi);
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 4e18640..9f386fb 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -70,6 +70,8 @@
// Store scaled source frames to be used for temporal filter to generate
// a alt ref frame.
YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+ // Temp buffer used for 2-stage down-sampling, for real-time mode.
+ YV12_BUFFER_CONFIG scaled_temp;
// Layer context used for rate control in one pass temporal CBR mode or
// two pass spatial mode.
@@ -134,6 +136,8 @@
void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
+void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 82f566b..ebe28b8 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -45,8 +45,7 @@
int x, int y) {
const int which_mv = 0;
const MV mv = { mv_row, mv_col };
- const InterpKernel *const kernel =
- vp9_filter_kernels[xd->mi[0]->interp_filter];
+ const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
enum mv_precision mv_precision_uv;
int uv_stride;
@@ -86,6 +85,7 @@
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+ (void)xd;
vp9_build_inter_predictor(y_mb_ptr, stride,
&pred[0], 16,
&mv,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 93be6d7..ee1d08a 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -50,6 +50,35 @@
const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
(sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
/ 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element vp9_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+ 3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+ 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+ 2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+ 3197, 3116, 3058, 2977, 2881, 2800,
+ 2742, 2661, 2615, 2534, 2476, 2395,
+ 2299, 2218, 2160, 2079,
+ 2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+ 1893, 1696, 1453, 1256, 1229, 864,
+ 512, 512, 512, 512, 0,
+ 512, 512, 512, 512,
+ 864, 1229, 1256, 1453, 1696, 1893,
+ 1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+ 2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+ 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+ 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+ 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+ 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp9_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+ / 2;
// Array indices are identical to previously-existing CONTEXT_NODE indices
const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index df979b2..fad7988 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -74,6 +74,7 @@
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int *vp9_dct_cat_lt_10_value_cost;
extern const int16_t vp9_cat6_low_cost[256];
extern const int vp9_cat6_high_cost[64];
extern const int vp9_cat6_high10_high_cost[256];
@@ -117,6 +118,18 @@
return vp9_dct_cat_lt_10_value_tokens[v].token;
}
+static INLINE int vp9_get_token_cost(int v, int16_t *token,
+ const int *cat6_high_table) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ EXTRABIT extrabits;
+ *token = CATEGORY6_TOKEN;
+ extrabits = abs(v) - CAT6_MIN_VAL;
+ return vp9_cat6_low_cost[extrabits & 0xff] +
+ cat6_high_table[extrabits >> 8];
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ return vp9_dct_cat_lt_10_value_cost[v];
+}
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 7f01acb..d13e699 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -145,7 +145,7 @@
RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
- RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
RANGE_CHECK_HI(cfg, g_profile, 3);
RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
@@ -473,7 +473,16 @@
oxcf->content = extra_cfg->content;
oxcf->tile_columns = extra_cfg->tile_columns;
- oxcf->tile_rows = extra_cfg->tile_rows;
+
+ // TODO(yunqing): The dependencies between row tiles cause error in multi-
+ // threaded encoding. For now, tile_rows is forced to be 0 in this case.
+ // The further fix can be done by adding synchronizations after a tile row
+ // is encoded. But this will hurt multi-threaded encoder performance. So,
+ // it is recommended to use tile-rows=0 while encoding with threads > 1.
+ if (oxcf->max_threads > 1 && oxcf->tile_columns > 0)
+ oxcf->tile_rows = 0;
+ else
+ oxcf->tile_rows = extra_cfg->tile_rows;
oxcf->error_resilient_mode = cfg->g_error_resilient;
oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
@@ -1553,7 +1562,7 @@
// keyframing settings (kf)
VPX_KF_AUTO, // g_kfmode
0, // kf_min_dist
- 9999, // kf_max_dist
+ 128, // kf_max_dist
VPX_SS_DEFAULT_LAYERS, // ss_number_layers
{0},
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 628afca..8028608 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -322,7 +322,7 @@
for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
if (si->svc_params.scaling_factor_den[sl] > 0) {
- alloc_ratio[sl] = (float)( (sl+1) );
+ alloc_ratio[sl] = (float)( pow(2, sl) );
total += alloc_ratio[sl];
}
}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index fc7060f..e371849 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -41,6 +41,7 @@
DSP_SRCS-yes += intrapred.c
ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
@@ -130,7 +131,6 @@
DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c
-DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c
ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ced7009..d01e81d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -548,7 +548,7 @@
$vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 mmx neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
@@ -569,7 +569,7 @@
$vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 mmx neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 95aa790..95c721a 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -34,7 +34,7 @@
int w, int h) { \
assert(filter[3] != 128); \
assert(step_q4 == 16); \
- if (filter[0] || filter[1] || filter[2]) { \
+ if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
@@ -46,27 +46,20 @@
dst += 16; \
w -= 16; \
} \
- while (w >= 8) { \
+ if (w == 8) { \
vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
h, \
filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
+ } else if (w == 4) { \
vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
h, \
filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
} \
} else { \
while (w >= 16) { \
@@ -80,27 +73,20 @@
dst += 16; \
w -= 16; \
} \
- while (w >= 8) { \
+ if (w == 8) { \
vpx_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
h, \
filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
+ } else if (w == 4) { \
vpx_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
h, \
filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
} \
} \
}
@@ -164,7 +150,7 @@
if (step_q4 == 16 && filter[3] != 128) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] || filter[1] || filter[2]) { \
+ if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm
index 15105e3..45d0ecc 100644
--- a/vpx_dsp/x86/loopfilter_mmx.asm
+++ b/vpx_dsp/x86/loopfilter_mmx.asm
@@ -1,5 +1,5 @@
;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
@@ -8,589 +8,429 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-;void vpx_lpf_horizontal_4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-global sym(vpx_lpf_horizontal_4_mmx) PRIVATE
-sym(vpx_lpf_horizontal_4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rdx, arg(3) ;limit
- movq mm7, [rdx]
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
- ; calculate breakout conditions
- movq mm2, [rdi+2*rax] ; q3
- movq mm1, [rsi+2*rax] ; q2
- movq mm6, mm1 ; q2
- psubusb mm1, mm2 ; q2-=q3
- psubusb mm2, mm6 ; q3-=q2
- por mm1, mm2 ; abs(q3-q2)
- psubusb mm1, mm7 ;
-
-
- movq mm4, [rsi+rax] ; q1
- movq mm3, mm4 ; q1
- psubusb mm4, mm6 ; q1-=q2
- psubusb mm6, mm3 ; q2-=q1
- por mm4, mm6 ; abs(q2-q1)
-
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- psubusb mm4, mm3 ; q0-=q1
- psubusb mm3, mm0 ; q1-=q0
- por mm4, mm3 ; abs(q0-q1)
- movq t0, mm4 ; save to t0
- psubusb mm4, mm7
- por mm1, mm4
-
-
- neg rax ; negate pitch to deal with above border
-
- movq mm2, [rsi+4*rax] ; p3
- movq mm4, [rdi+4*rax] ; p2
- movq mm5, mm4 ; p2
- psubusb mm4, mm2 ; p2-=p3
- psubusb mm2, mm5 ; p3-=p2
- por mm4, mm2 ; abs(p3 - p2)
- psubusb mm4, mm7
- por mm1, mm4
-
-
- movq mm4, [rsi+2*rax] ; p1
- movq mm3, mm4 ; p1
- psubusb mm4, mm5 ; p1-=p2
- psubusb mm5, mm3 ; p2-=p1
- por mm4, mm5 ; abs(p2 - p1)
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm2, mm3 ; p1
-
- movq mm4, [rsi+rax] ; p0
- movq mm5, mm4 ; p0
- psubusb mm4, mm3 ; p0-=p1
- psubusb mm3, mm5 ; p1-=p0
- por mm4, mm3 ; abs(p1 - p0)
- movq t1, mm4 ; save to t1
- psubusb mm4, mm7
- por mm1, mm4
-
- movq mm3, [rdi] ; q1
- movq mm4, mm3 ; q1
- psubusb mm3, mm2 ; q1-=p1
- psubusb mm2, mm4 ; p1-=q1
- por mm2, mm3 ; abs(p1-q1)
- pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm2, 1 ; abs(p1-q1)/2
-
- movq mm6, mm5 ; p0
- movq mm3, [rsi] ; q0
- psubusb mm5, mm3 ; p0-=q0
- psubusb mm3, mm6 ; q0-=p0
- por mm5, mm3 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx] ; blimit
-
- psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm5
- pxor mm5, mm5
- pcmpeqb mm1, mm5 ; mask mm1
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx] ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
- paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
- pcmpeqb mm4, mm5
-
- pcmpeqb mm5, mm5
- pxor mm4, mm5
-
-
- ; start work on filters
- movq mm2, [rsi+2*rax] ; p1
- movq mm7, [rdi] ; q1
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
- pxor mm0, mm0 ;
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
- psraw mm5, 11
- packsswb mm0, mm5
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- movq mm6, [rsi+2*rax] ; p1
- pxor mm6, [GLOBAL(t80)] ; reoffset
- paddsb mm6, mm4 ; p1+= p1 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+2*rax], mm6 ; write back
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- movq [rdi], mm7 ; write back
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vpx_lpf_vertical_4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-global sym(vpx_lpf_vertical_4_mmx) PRIVATE
-sym(vpx_lpf_vertical_4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 64 ; reserve 64 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4 - 4]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
-
-
- ;transpose
- movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
- movq mm7, mm6 ; 77 76 75 74 73 72 71 70
-
- punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
- punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
-
- movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
- movq mm5, mm4 ; 47 46 45 44 43 42 41 40
-
- punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
- punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
-
- movq mm3, mm5 ; 57 47 56 46 55 45 54 44
- punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
-
- punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
- movq mm2, mm4 ; 53 43 52 42 51 41 50 40
-
- punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
- punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
-
- neg rax
- movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
-
- movq mm1, mm6 ; 27 26 25 24 23 22 21 20
- punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24
-
- punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
- movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
-
- punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
- movq mm0, mm7 ; 17 07 16 06 15 05 14 04
-
- punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
- punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
-
- movq mm6, mm7 ; 37 27 17 07 36 26 16 06
- punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
-
- punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
-
- movq mm5, mm6 ; 76 66 56 46 36 26 16 06
- psubusb mm5, mm7 ; q2-q3
-
- psubusb mm7, mm6 ; q3-q2
- por mm7, mm5; ; mm7=abs (q3-q2)
-
- movq mm5, mm0 ; 35 25 15 05 34 24 14 04
- punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
-
- punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
- movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
-
- psubusb mm3, mm6 ; q1-q2
- psubusb mm6, mm5 ; q2-q1
-
- por mm6, mm3 ; mm6=abs(q2-q1)
- lea rdx, srct
-
- movq [rdx+24], mm5 ; save q1
- movq [rdx+16], mm0 ; save q0
-
- movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
- punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
-
- movq mm0, mm3 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 31 21 11 01 30 20 10 00
-
- punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
- punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
-
- movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
- psubusb mm2, mm0 ; p2-p3
-
- psubusb mm0, mm1 ; p3-p2
- por mm0, mm2 ; mm0=abs(p3-p2)
-
- movq mm2, mm3 ; 33 23 13 03 32 22 12 02
- punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
-
- punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
- movq [rdx+8], mm3 ; save p0
-
- movq [rdx], mm2 ; save p1
- movq mm5, mm2 ; mm5 = p1
-
- psubusb mm2, mm1 ; p1-p2
- psubusb mm1, mm5 ; p2-p1
-
- por mm1, mm2 ; mm1=abs(p2-p1)
- mov rdx, arg(3) ;limit
-
- movq mm4, [rdx] ; mm4 = limit
- psubusb mm7, mm4
-
- psubusb mm0, mm4
- psubusb mm1, mm4
-
- psubusb mm6, mm4
- por mm7, mm6
-
- por mm0, mm1
- por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
- movq mm1, mm5 ; p1
-
- movq mm7, mm3 ; mm3=mm7=p0
- psubusb mm7, mm5 ; p0 - p1
-
- psubusb mm5, mm3 ; p1 - p0
- por mm5, mm7 ; abs(p1-p0)
-
- movq t0, mm5 ; save abs(p1-p0)
- lea rdx, srct
-
- psubusb mm5, mm4
- por mm0, mm5 ; mm0=mask
-
- movq mm5, [rdx+16] ; mm5=q0
- movq mm7, [rdx+24] ; mm7=q1
-
- movq mm6, mm5 ; mm6=q0
- movq mm2, mm7 ; q1
- psubusb mm5, mm7 ; q0-q1
-
- psubusb mm7, mm6 ; q1-q0
- por mm7, mm5 ; abs(q1-q0)
-
- movq t1, mm7 ; save abs(q1-q0)
- psubusb mm7, mm4
-
- por mm0, mm7 ; mask
-
- movq mm5, mm2 ; q1
- psubusb mm5, mm1 ; q1-=p1
- psubusb mm1, mm2 ; p1-=q1
- por mm5, mm1 ; abs(p1-q1)
- pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm5, 1 ; abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ;
-
- movq mm4, [rdx] ;blimit
- movq mm1, mm3 ; mm1=mm3=p0
-
- movq mm7, mm6 ; mm7=mm6=q0
- psubusb mm1, mm7 ; p0-q0
-
- psubusb mm7, mm3 ; q0-p0
- por mm1, mm7 ; abs(q0-p0)
- paddusb mm1, mm1 ; abs(q0-p0)*2
- paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por mm1, mm0; ; mask
-
- pxor mm0, mm0
- pcmpeqb mm1, mm0
-
- ; calculate high edge variance
- mov rdx, arg(4) ;thresh ; get thresh
- movq mm7, [rdx]
- ;
- movq mm4, t0 ; get abs (q1 - q0)
- psubusb mm4, mm7
-
- movq mm3, t1 ; get abs (p1 - p0)
- psubusb mm3, mm7
-
- por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
- pcmpeqb mm4, mm0
-
- pcmpeqb mm0, mm0
- pxor mm4, mm0
-
-
-
- ; start work on filters
- lea rdx, srct
-
- movq mm2, [rdx] ; p1
- movq mm7, [rdx+24] ; q1
-
- movq mm6, [rdx+8] ; p0
- movq mm0, [rdx+16] ; q0
-
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm2, mm7 ; p1 - q1
- pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
-
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
-
- paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
- paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
- paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
- pand mm1, mm2 ; mask filter values we don't care about
-
- movq mm2, mm1
- paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-
- paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
- pxor mm0, mm0 ;
-
- pxor mm5, mm5
- punpcklbw mm0, mm2 ;
-
- punpckhbw mm5, mm2 ;
- psraw mm0, 11 ;
-
- psraw mm5, 11
- packsswb mm0, mm5
-
- movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-
- pxor mm0, mm0 ; 0
- movq mm5, mm1 ; abcdefgh
-
- punpcklbw mm0, mm1 ; e0f0g0h0
- psraw mm0, 11 ; sign extended shift right by 3
-
- pxor mm1, mm1 ; 0
- punpckhbw mm1, mm5 ; a0b0c0d0
-
- psraw mm1, 11 ; sign extended shift right by 3
- movq mm5, mm0 ; save results
-
- packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw mm5, [GLOBAL(ones)]
-
- paddsw mm1, [GLOBAL(ones)]
- psraw mm5, 1 ; partial shifted one more time for 2nd tap
-
- psraw mm1, 1 ; partial shifted one more time for 2nd tap
- packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
- pandn mm4, mm5 ; high edge variance additive
-
- paddsb mm6, mm2 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
- ; mm6=p0 ;
- movq mm1, [rdx] ; p1
- pxor mm1, [GLOBAL(t80)] ; reoffset
-
- paddsb mm1, mm4 ; p1+= p1 add
- pxor mm1, [GLOBAL(t80)] ; unoffset
- ; mm6 = p0 mm1 = p1
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; mm3 = q0
- psubsb mm7, mm4 ; q1-= q1 add
- pxor mm7, [GLOBAL(t80)] ; unoffset
- ; mm7 = q1
-
- ; transpose and write back
- ; mm1 = 72 62 52 42 32 22 12 02
- ; mm6 = 73 63 53 43 33 23 13 03
- ; mm3 = 74 64 54 44 34 24 14 04
- ; mm7 = 75 65 55 45 35 25 15 05
-
- movq mm2, mm1 ; 72 62 52 42 32 22 12 02
- punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
-
- movq mm4, mm3 ; 74 64 54 44 34 24 14 04
- punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
-
- punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
- punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
-
- movq mm6, mm2 ; 33 32 23 22 13 12 03 02
- punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
-
- punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
- movq mm5, mm1 ; 73 72 63 62 53 52 43 42
-
- punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
- punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
-
-
- ; mm2 = 15 14 13 12 05 04 03 02
- ; mm6 = 35 34 33 32 25 24 23 22
- ; mm5 = 55 54 53 52 45 44 43 42
- ; mm1 = 75 74 73 72 65 64 63 62
-
-
-
- movd [rsi+rax*4+2], mm2
- psrlq mm2, 32
-
- movd [rdi+rax*4+2], mm2
- movd [rsi+rax*2+2], mm6
-
- psrlq mm6, 32
- movd [rsi+rax+2],mm6
-
- movd [rsi+2], mm1
- psrlq mm1, 32
-
- movd [rdi+2], mm1
- neg rax
-
- movd [rdi+rax+2],mm5
- psrlq mm5, 32
-
- movd [rdi+rax*2+2], mm5
-
- add rsp, 64
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
+%include "third_party/x86inc/x86inc.asm"
SECTION_RODATA
align 16
tfe:
times 8 db 0xfe
-align 16
t80:
times 8 db 0x80
-align 16
t3:
times 8 db 0x03
-align 16
t4:
times 8 db 0x04
-align 16
ones:
times 4 dw 0x0001
+
+SECTION .text
+
+%define stkreg rsp
+
+%define t0 0
+%define t1 t0 + 16
+%define p1 t1 + 16
+%define p0 p1 + 16
+%define q0 p0 + 16
+%define q1 q0 + 16
+%define lstacksize q1 + 16
+
+%define goffsetq _limitq
+
+;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int src_pixel_step,
+; const char *blimit, const char *limit,
+; const char *thresh);
+INIT_MMX mmx
+cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \
+ s, p, _blimit, _limit, _thresh, s1
+ movq m7, [_limitq]
+ GET_GOT goffsetq
+%if GET_GOT_DEFINED=1
+ add rsp, gprsize ; restore stack
+%endif
+ lea s1q, [sq + pq] ; s1q points to row +1
+
+ ; calculate breakout conditions
+ movq m2, [s1q + 2 * pq] ; q3
+ movq m1, [ sq + 2 * pq] ; q2
+ movq m6, m1 ; q2
+ psubusb m1, m2 ; q2-=q3
+ psubusb m2, m6 ; q3-=q2
+ por m1, m2 ; abs(q3-q2)
+ psubusb m1, m7
+ movq m4, [sq + pq] ; q1
+ movq m3, m4 ; q1
+ psubusb m4, m6 ; q1-=q2
+ psubusb m6, m3 ; q2-=q1
+ por m4, m6 ; abs(q2-q1)
+ psubusb m4, m7
+ por m1, m4
+ movq m4, [sq] ; q0
+ movq m0, m4 ; q0
+ psubusb m4, m3 ; q0-=q1
+ psubusb m3, m0 ; q1-=q0
+ por m4, m3 ; abs(q0-q1)
+ movq [stkreg + t0], m4 ; save to t0
+ psubusb m4, m7
+ por m1, m4
+ neg pq ; negate pitch to deal with
+ ; above border
+ movq m2, [ sq + 4 * pq] ; p3
+ movq m4, [s1q + 4 * pq] ; p2
+ movq m5, m4 ; p2
+ psubusb m4, m2 ; p2-=p3
+ psubusb m2, m5 ; p3-=p2
+ por m4, m2 ; abs(p3 - p2)
+ psubusb m4, m7
+ por m1, m4
+ movq m4, [sq + 2 * pq] ; p1
+ movq m3, m4 ; p1
+ psubusb m4, m5 ; p1-=p2
+ psubusb m5, m3 ; p2-=p1
+ por m4, m5 ; abs(p2 - p1)
+ psubusb m4, m7
+ por m1, m4
+ movq m2, m3 ; p1
+ movq m4, [sq + pq] ; p0
+ movq m5, m4 ; p0
+ psubusb m4, m3 ; p0-=p1
+ psubusb m3, m5 ; p1-=p0
+ por m4, m3 ; abs(p1 - p0)
+ movq [stkreg + t1], m4 ; save to t1
+ psubusb m4, m7
+ por m1, m4
+ movq m3, [s1q] ; q1
+ movq m4, m3 ; q1
+ psubusb m3, m2 ; q1-=p1
+ psubusb m2, m4 ; p1-=q1
+ por m2, m3 ; abs(p1-q1)
+ pand m2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw m2, 1 ; abs(p1-q1)/2
+ movq m6, m5 ; p0
+ movq m3, [sq] ; q0
+ psubusb m5, m3 ; p0-=q0
+ psubusb m3, m6 ; q0-=p0
+ por m5, m3 ; abs(p0 - q0)
+ paddusb m5, m5 ; abs(p0-q0)*2
+ paddusb m5, m2 ; abs (p0 - q0) * 2 + abs(p1-q1)/2
+ movq m7, [_blimitq] ; blimit
+ psubusb m5, m7 ; abs (p0 - q0) * 2 +
+ ; abs(p1-q1)/2 > blimit
+ por m1, m5
+ pxor m5, m5
+ pcmpeqb m1, m5 ; mask m1
+
+ ; calculate high edge variance
+ movq m7, [_threshq]
+ movq m4, [stkreg + t0] ; get abs (q1 - q0)
+ psubusb m4, m7
+ movq m3, [stkreg + t1] ; get abs (p1 - p0)
+ psubusb m3, m7
+ paddb m4, m3 ; abs(q1 - q0) > thresh ||
+ ; abs(p1 - p0) > thresh
+ pcmpeqb m4, m5
+ pcmpeqb m5, m5
+ movq m3, [GLOBAL(t80)]
+ pxor m4, m5
+
+ ; start work on filters
+ movq m2, [sq + 2 * pq] ; p1
+ movq m7, [s1q] ; q1
+ pxor m2, m3 ; p1 converted to signed values
+ pxor m7, m3 ; q1 converted to signed values
+ psubsb m2, m7 ; p1 - q1
+ pand m2, m4 ; high var mask (hvm)(p1 - q1)
+ pxor m6, m3 ; p0 converted to signed values
+ pxor m0, m3 ; q0 converted to signed values
+ movq m3, m0 ; q0
+ psubsb m0, m6 ; q0 - p0
+ paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand m1, m2 ; mask filter values we don't
+ ; care about
+ movq m2, m1
+ paddsb m1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb m2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor m0, m0
+ pxor m5, m5
+ punpcklbw m0, m2
+ punpckhbw m5, m2
+ psraw m0, 11
+ psraw m5, 11
+ packsswb m0, m5
+ movq m2, m0 ; (3* (q0 - p0) + hvm(p1 - q1)
+ ; + 3) >> 3;
+ pxor m0, m0
+ movq m5, m1 ; abcdefgh
+ punpcklbw m0, m1 ; e0f0g0h0
+ psraw m0, 11 ; sign extended shift right by 3
+ pxor m1, m1
+ punpckhbw m1, m5 ; a0b0c0d0
+ psraw m1, 11 ; sign extended shift right by 3
+ movq m5, m0 ; save results
+
+ packsswb m0, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
+ ; + 4) >>3
+ paddsw m5, [GLOBAL(ones)]
+ paddsw m1, [GLOBAL(ones)]
+ psraw m5, 1
+ psraw m1, 1
+ packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
+ ; + 4) >>4
+ movq m1, [GLOBAL(t80)]
+ pandn m4, m5 ; high edge variance additive
+ paddsb m6, m2 ; p0+= p0 add
+ pxor m6, m1 ; unoffset
+ movq [sq + pq], m6 ; write back
+ movq m6, [sq + 2 * pq] ; p1
+ pxor m6, m1 ; reoffset
+ paddsb m6, m4 ; p1+= p1 add
+ pxor m6, m1 ; unoffset
+ movq [sq + 2 * pq], m6 ; write back
+ psubsb m3, m0 ; q0-= q0 add
+ pxor m3, m1 ; unoffset
+ movq [sq], m3 ; write back
+ psubsb m7, m4 ; q1-= q1 add
+ pxor m7, m1 ; unoffset
+ movq [s1q], m7 ; write back
+ RET
+
+;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int src_pixel_step,
+; const char *blimit, const char *limit,
+; const char *thresh);
+INIT_MMX mmx
+cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
+ s, p, _blimit, _limit, _thresh, s1
+ lea sq, [sq + pq * 4 - 4]
+ lea s1q, [sq + pq] ; s1q points to row +1
+ ;transpose
+ movq m6, [ sq + 2 * pq] ; 67 66 65 64 63 62 61 60
+ movq m7, m6 ; 77 76 75 74 73 72 71 70
+ punpckhbw m7, [s1q + 2 * pq] ; 77 67 76 66 75 65 74 64
+ punpcklbw m6, [s1q + 2 * pq] ; 73 63 72 62 71 61 70 60
+ movq m4, [sq] ; 47 46 45 44 43 42 41 40
+ movq m5, m4 ; 47 46 45 44 43 42 41 40
+ punpckhbw m5, [sq + pq] ; 57 47 56 46 55 45 54 44
+ punpcklbw m4, [sq + pq] ; 53 43 52 42 51 41 50 40
+ movq m3, m5 ; 57 47 56 46 55 45 54 44
+ punpckhwd m5, m7 ; 77 67 57 47 76 66 56 46
+ punpcklwd m3, m7 ; 75 65 55 45 74 64 54 44
+ movq m2, m4 ; 53 43 52 42 51 41 50 40
+ punpckhwd m4, m6 ; 73 63 53 43 72 62 52 42
+ punpcklwd m2, m6 ; 71 61 51 41 70 60 50 40
+ neg pq
+ movq m6, [ sq + pq * 2] ; 27 26 25 24 23 22 21 20
+ movq m1, m6 ; 27 26 25 24 23 22 21 20
+    punpckhbw       m6, [ sq + pq ]              ; 37 27 36 26 35 25 34 24
+ punpcklbw m1, [ sq + pq ] ; 33 23 32 22 31 21 30 20
+ movq m7, [ sq + pq * 4]; ; 07 06 05 04 03 02 01 00
+ punpckhbw m7, [s1q + pq * 4] ; 17 07 16 06 15 05 14 04
+ movq m0, m7 ; 17 07 16 06 15 05 14 04
+ punpckhwd m7, m6 ; 37 27 17 07 36 26 16 06
+ punpcklwd m0, m6 ; 35 25 15 05 34 24 14 04
+ movq m6, m7 ; 37 27 17 07 36 26 16 06
+ punpckhdq m7, m5 ; 77 67 57 47 37 27 17 07 = q3
+ punpckldq m6, m5 ; 76 66 56 46 36 26 16 06 = q2
+ movq m5, m6 ; 76 66 56 46 36 26 16 06
+ psubusb m5, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m5; ; m7=abs (q3-q2)
+ movq m5, m0 ; 35 25 15 05 34 24 14 04
+ punpckhdq m5, m3 ; 75 65 55 45 35 25 15 05 = q1
+    punpckldq       m0, m3                       ; 74 64 54 44 34 24 14 04 = q0
+ movq m3, m5 ; 75 65 55 45 35 25 15 05 = q1
+ psubusb m3, m6 ; q1-q2
+ psubusb m6, m5 ; q2-q1
+ por m6, m3 ; m6=abs(q2-q1)
+
+ movq [stkreg + q1], m5 ; save q1
+ movq [stkreg + q0], m0 ; save q0
+
+ movq m3, [ sq + pq * 4] ; 07 06 05 04 03 02 01 00
+ punpcklbw m3, [s1q + pq * 4] ; 13 03 12 02 11 01 10 00
+ movq m0, m3 ; 13 03 12 02 11 01 10 00
+ punpcklwd m0, m1 ; 31 21 11 01 30 20 10 00
+ punpckhwd m3, m1 ; 33 23 13 03 32 22 12 02
+ movq m1, m0 ; 31 21 11 01 30 20 10 00
+ punpckldq m0, m2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq m1, m2 ; 71 61 51 41 31 21 11 01 =p2
+ movq m2, m1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb m2, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m2 ; m0=abs(p3-p2)
+ movq m2, m3 ; 33 23 13 03 32 22 12 02
+ punpckldq m2, m4 ; 72 62 52 42 32 22 12 02 = p1
+ punpckhdq m3, m4 ; 73 63 53 43 33 23 13 03 = p0
+
+ movq [stkreg + p0], m3 ; save p0
+ movq [stkreg + p1], m2 ; save p1
+ movq m5, m2 ; m5 = p1
+ psubusb m2, m1 ; p1-p2
+ psubusb m1, m5 ; p2-p1
+ por m1, m2 ; m1=abs(p2-p1)
+ movq m4, [_limitq]
+ GET_GOT goffsetq
+%if GET_GOT_DEFINED=1
+ add rsp, gprsize ; restore stack
+%endif
+ psubusb m7, m4
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m6, m4
+ por m7, m6
+ por m0, m1
+ por m0, m7 ; abs(q3-q2) > limit ||
+ ; abs(p3-p2) > limit ||
+ ; abs(p2-p1) > limit ||
+ ; abs(q2-q1) > limit
+ movq m1, m5 ; p1
+ movq m7, m3 ; m3=m7=p0
+ psubusb m7, m5 ; p0 - p1
+ psubusb m5, m3 ; p1 - p0
+ por m5, m7 ; abs(p1-p0)
+ movq [stkreg + t0], m5 ; save abs(p1-p0)
+ psubusb m5, m4
+ por m0, m5 ; m0=mask
+ movq m5, [stkreg + q0] ; m5=q0
+ movq m7, [stkreg + q1] ; m7=q1
+ movq m6, m5 ; m6=q0
+ movq m2, m7 ; q1
+ psubusb m5, m7 ; q0-q1
+ psubusb m7, m6 ; q1-q0
+ por m7, m5 ; abs(q1-q0)
+ movq [stkreg + t1], m7 ; save abs(q1-q0)
+ psubusb m7, m4
+ por m0, m7 ; mask
+ movq m5, m2 ; q1
+ psubusb m5, m1 ; q1-=p1
+ psubusb m1, m2 ; p1-=q1
+ por m5, m1 ; abs(p1-q1)
+ pand m5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw m5, 1 ; abs(p1-q1)/2
+ movq m4, [_blimitq]
+ movq m1, m3 ; m1=m3=p0
+ movq m7, m6 ; m7=m6=q0
+ psubusb m1, m7 ; p0-q0
+ psubusb m7, m3 ; q0-p0
+ por m1, m7 ; abs(q0-p0)
+ paddusb m1, m1 ; abs(q0-p0)*2
+ paddusb m1, m5 ; abs(p0 - q0)*2 + abs(p1-q1)/2
+ psubusb m1, m4 ; abs(p0 - q0)*2 + abs(p1-q1)/2
+ ; > blimit
+ por m1, m0; ; mask
+ pxor m0, m0
+ pcmpeqb m1, m0
+
+ ; calculate high edge variance
+ movq m7, [_threshq]
+ movq m4, [stkreg + t0] ; get abs (q1 - q0)
+ psubusb m4, m7
+ movq m3, [stkreg + t1] ; get abs (p1 - p0)
+ psubusb m3, m7
+ por m4, m3 ; abs(q1 - q0) > thresh ||
+ ; abs(p1 - p0) > thresh
+ pcmpeqb m4, m0
+ pcmpeqb m0, m0
+ movq m3, [GLOBAL(t80)]
+ pxor m4, m0
+
+ ; start work on filters
+ movq m2, [stkreg + p1]
+ movq m7, [stkreg + q1]
+ movq m6, [stkreg + p0]
+ movq m0, [stkreg + q0]
+ pxor m2, m3
+ pxor m7, m3
+ psubsb m2, m7 ; p1 - q1
+ pand m2, m4 ; high var mask (hvm)(p1 - q1)
+ pxor m6, m3
+ pxor m0, m3
+ movq m3, m0 ; q0
+ psubsb m0, m6 ; q0 - p0
+ paddsb m2, m0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb m2, m0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb m2, m0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand m1, m2 ; mask filter values we don't
+ ; care about
+ movq m2, m1
+ paddsb m1, [GLOBAL(t4)] ; 3*(q0 - p0) + hvm(p1 - q1) + 4
+ paddsb m2, [GLOBAL(t3)] ; 3*(q0 - p0) + hvm(p1 - q1) + 3
+ pxor m0, m0
+ pxor m5, m5
+ punpcklbw m0, m2
+ punpckhbw m5, m2
+ psraw m0, 11
+ psraw m5, 11
+ packsswb m0, m5
+ movq m2, m0 ; (3*(q0 - p0) + hvm(p1 - q1)
+ ; + 3) >> 3;
+ pxor m0, m0
+ movq m5, m1 ; abcdefgh
+ punpcklbw m0, m1 ; e0f0g0h0
+ psraw m0, 11 ; sign extended shift right by 3
+ pxor m1, m1
+ punpckhbw m1, m5 ; a0b0c0d0
+ psraw m1, 11 ; sign extended shift right by 3
+ movq m5, m0 ; save results
+ packsswb m0, m1 ; (3*(q0 - p0) + hvm(p1 - q1)
+ ; + 4) >>3
+ paddsw m5, [GLOBAL(ones)]
+ paddsw m1, [GLOBAL(ones)]
+ psraw m5, 1
+ psraw m1, 1
+ packsswb m5, m1 ; (3* (q0 - p0) + hvm(p1 - q1)
+ ; + 4) >>4
+ pandn m4, m5 ; high edge variance additive
+ movq m5, [GLOBAL(t80)]
+ paddsb m6, m2 ; p0+= p0 add
+ pxor m6, m5 ; unoffset
+ ; m6=p0
+ movq m1, [stkreg + p1]
+ pxor m1, m5 ; reoffset
+ paddsb m1, m4 ; p1+= p1 add
+ pxor m1, m5 ; unoffset
+ ; m6 = p0 m1 = p1
+ psubsb m3, m0 ; q0-= q0 add
+ pxor m3, m5 ; unoffset
+ ; m3 = q0
+ psubsb m7, m4 ; q1-= q1 add
+ pxor m7, m5 ; unoffset
+ ; m7 = q1
+ ; transpose and write back
+ ; m1 = 72 62 52 42 32 22 12 02
+ ; m6 = 73 63 53 43 33 23 13 03
+ ; m3 = 74 64 54 44 34 24 14 04
+ ; m7 = 75 65 55 45 35 25 15 05
+ movq m2, m1 ; 72 62 52 42 32 22 12 02
+ punpcklbw m2, m6 ; 33 32 23 22 13 12 03 02
+ movq m4, m3 ; 74 64 54 44 34 24 14 04
+ punpckhbw m1, m6 ; 73 72 63 62 53 52 43 42
+ punpcklbw m4, m7 ; 35 34 25 24 15 14 05 04
+ punpckhbw m3, m7 ; 75 74 65 64 55 54 45 44
+ movq m6, m2 ; 33 32 23 22 13 12 03 02
+ punpcklwd m2, m4 ; 15 14 13 12 05 04 03 02
+ punpckhwd m6, m4 ; 35 34 33 32 25 24 23 22
+ movq m5, m1 ; 73 72 63 62 53 52 43 42
+ punpcklwd m1, m3 ; 55 54 53 52 45 44 43 42
+ punpckhwd m5, m3 ; 75 74 73 72 65 64 63 62
+
+ ; m2 = 15 14 13 12 05 04 03 02
+ ; m6 = 35 34 33 32 25 24 23 22
+ ; m5 = 55 54 53 52 45 44 43 42
+ ; m1 = 75 74 73 72 65 64 63 62
+ movd [sq + pq * 4 + 2], m2
+ psrlq m2, 32
+ movd [s1q + pq * 4 + 2], m2
+ movd [sq + pq * 2 + 2], m6
+ psrlq m6, 32
+ movd [sq + pq + 2], m6
+ movd [sq + 2], m1
+ psrlq m1, 32
+ movd [s1q + 2], m1
+ neg pq
+ movd [s1q + pq + 2], m5
+ psrlq m5, 32
+ movd [s1q + pq * 2 + 2], m5
+ RET
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
index 3fbaa27..d2cb8ea 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
+++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
@@ -16,6 +16,11 @@
; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
; when using this instruction.
+;
+; The add order below (based on ffvp9) must be followed to prevent outranges.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
SECTION .text
%if ARCH_X86_64
@@ -77,17 +82,12 @@
pmaddubsw %2, k0k1k4k5
pmaddubsw m3, k2k3k6k7
-
- mova m4, %2
- mova m5, m3
- psrldq %2, 8
- psrldq m3, 8
- mova m6, m5
-
- paddsw m4, m3
- pmaxsw m5, %2
- pminsw %2, m6
+ mova m4, %2 ;k0k1
+ mova m5, m3 ;k2k3
+ psrldq %2, 8 ;k4k5
+ psrldq m3, 8 ;k6k7
paddsw %2, m4
+ paddsw m5, m3
paddsw %2, m5
paddsw %2, krd
psraw %2, 7
@@ -157,27 +157,20 @@
pmaddubsw m7, k0k1k4k5
palignr m3, m2, 5
pmaddubsw m3, k2k3k6k7
- mova m0, m4
- mova m5, m1
- mova m2, m7
- psrldq m4, 8
- psrldq m1, 8
- mova m6, m5
- paddsw m0, m1
- mova m1, m3
- psrldq m7, 8
- psrldq m3, 8
- paddsw m2, m3
- mova m3, m1
- pmaxsw m5, m4
- pminsw m4, m6
+ mova m0, m4 ;k0k1
+ mova m5, m1 ;k2k3
+ mova m2, m7 ;k0k1 upper
+ psrldq m4, 8 ;k4k5
+ psrldq m1, 8 ;k6k7
paddsw m4, m0
- paddsw m4, m5
- pmaxsw m1, m7
- pminsw m7, m3
+ paddsw m5, m1
+ mova m1, m3 ;k2k3 upper
+ psrldq m7, 8 ;k4k5 upper
+ psrldq m3, 8 ;k6k7 upper
paddsw m7, m2
+ paddsw m4, m5
+ paddsw m1, m3
paddsw m7, m1
-
paddsw m4, krd
psraw m4, 7
packuswb m4, m4
@@ -240,16 +233,13 @@
pmaddubsw %3, k2k3
pmaddubsw %4, k4k5
pmaddubsw %5, k6k7
-
+ paddsw %2, %4
+ paddsw %5, %3
paddsw %2, %5
- mova %1, %3
- pminsw %3, %4
- pmaxsw %1, %4
- paddsw %2, %3
- paddsw %1, %2
- paddsw %1, krd
- psraw %1, 7
- packuswb %1, %1
+ paddsw %2, krd
+ psraw %2, 7
+ packuswb %2, %2
+ SWAP %1, %2
%endm
;-------------------------------------------------------------------------------
@@ -293,39 +283,33 @@
pmaddubsw m3, k4k5
palignr m7, m4, 13
- paddsw m1, m5
- mova m5, m6
- mova m0, m2
- palignr m5, m4, 5
- pminsw m2, m3
- pmaddubsw m7, k6k7
- pmaxsw m3, m0
- paddsw m1, m2
mova m0, m6
- palignr m6, m4, 1
- pmaddubsw m5, k2k3
+ palignr m0, m4, 5
+ pmaddubsw m7, k6k7
paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+ mova m5, m6
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
pmaddubsw m6, k0k1
- palignr m0, m4, 9
+ palignr m5, m4, 9
paddsw m1, krd
- pmaddubsw m0, k4k5
- mova m4, m5
+ pmaddubsw m5, k4k5
psraw m1, 7
- pminsw m5, m0
- paddsw m6, m7
+ paddsw m0, m7
+%ifidn %1, h8_avg
+ movh m7, [dstq]
+ movh m2, [dstq + dstrideq]
+%endif
packuswb m1, m1
-
paddsw m6, m5
- pmaxsw m0, m4
paddsw m6, m0
paddsw m6, krd
psraw m6, 7
packuswb m6, m6
-
%ifidn %1, h8_avg
- movh m0, [dstq]
- movh m2, [dstq + dstrideq]
- pavgb m1, m0
+ pavgb m1, m7
pavgb m6, m2
%endif
movh [dstq], m1
@@ -388,7 +372,7 @@
pmaddubsw m1, k2k3
palignr m2, m7, 9
pmaddubsw m2, k4k5
- paddsw m0, m3
+ paddsw m1, m3
mova m3, m4
punpckhbw m4, m4
mova m5, m4
@@ -403,17 +387,13 @@
pmaddubsw m6, k4k5
palignr m7, m3, 13
pmaddubsw m7, k6k7
-
- mova m3, m1
- pmaxsw m1, m2
- pminsw m2, m3
paddsw m0, m2
paddsw m0, m1
- paddsw m4, m7
- mova m7, m5
- pmaxsw m5, m6
- pminsw m6, m7
+%ifidn %1, h8_avg
+ mova m1, [dstq]
+%endif
paddsw m4, m6
+ paddsw m5, m7
paddsw m4, m5
paddsw m0, krd
paddsw m4, krd
@@ -421,7 +401,6 @@
psraw m4, 7
packuswb m0, m4
%ifidn %1, h8_avg
- mova m1, [dstq]
pavgb m0, m1
%endif
lea srcq, [srcq + sstrideq]
@@ -488,27 +467,21 @@
movx m7, [src1q + sstride6q ] ;H
punpcklbw m6, m7 ;G H
pmaddubsw m6, k6k7
- mova tmp, m2
pmaddubsw m3, k2k3
pmaddubsw m1, k0k1
- pmaxsw m2, m4
- paddsw m0, m6
+ paddsw m0, m4
+ paddsw m2, m6
movx m6, [srcq + sstrideq * 8 ] ;H next iter
punpcklbw m7, m6
pmaddubsw m7, k6k7
- pminsw m4, tmp
- paddsw m0, m4
- mova m4, m3
paddsw m0, m2
- pminsw m3, m5
- pmaxsw m5, m4
paddsw m0, krd
psraw m0, 7
- paddsw m1, m7
+ paddsw m1, m5
packuswb m0, m0
+ paddsw m3, m7
paddsw m1, m3
- paddsw m1, m5
paddsw m1, krd
psraw m1, 7
lea srcq, [srcq + sstrideq * 2 ]
@@ -538,11 +511,11 @@
movx m1, [srcq + sstrideq ] ;B
movx m6, [srcq + sstride6q ] ;G
punpcklbw m0, m1 ;A B
- movx m7, [rax + sstride6q ] ;H
+ movx m7, [src1q + sstride6q ] ;H
pmaddubsw m0, k0k1
movx m2, [srcq + sstrideq * 2 ] ;C
punpcklbw m6, m7 ;G H
- movx m3, [rax + sstrideq * 2 ] ;D
+ movx m3, [src1q + sstrideq * 2] ;D
pmaddubsw m6, k6k7
movx m4, [srcq + sstrideq * 4 ] ;E
punpcklbw m2, m3 ;C D
@@ -550,10 +523,7 @@
punpcklbw m4, m5 ;E F
pmaddubsw m2, k2k3
pmaddubsw m4, k4k5
- paddsw m0, m6
- mova m1, m2
- pmaxsw m2, m4
- pminsw m4, m1
+ paddsw m2, m6
paddsw m0, m4
paddsw m0, m2
paddsw m0, krd
@@ -572,7 +542,6 @@
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
-
mova m4, [filterq]
SETUP_LOCAL_VARS
%if ARCH_X86_64
@@ -611,12 +580,9 @@
punpcklbw m3, m5 ;A B
movh m7, [srcq + sstrideq * 2 + 8] ;C
pmaddubsw m6, k6k7
- mova m1, m2
movh m5, [src1q + sstrideq * 2 + 8] ;D
- pmaxsw m2, m4
punpcklbw m7, m5 ;C D
- pminsw m4, m1
- paddsw m0, m6
+ paddsw m2, m6
pmaddubsw m3, k0k1
movh m1, [srcq + sstrideq * 4 + 8] ;E
paddsw m0, m4
@@ -630,30 +596,24 @@
movh m5, [src1q + sstride6q + 8] ;H
psraw m0, 7
punpcklbw m2, m5 ;G H
- packuswb m0, m0
pmaddubsw m2, k6k7
%ifidn %1, v8_avg
- movh m4, [dstq]
- pavgb m0, m4
+ mova m4, [dstq]
%endif
movh [dstq], m0
- mova m6, m7
- pmaxsw m7, m1
- pminsw m1, m6
- paddsw m3, m2
+ paddsw m7, m2
paddsw m3, m1
paddsw m3, m7
paddsw m3, krd
psraw m3, 7
- packuswb m3, m3
+ packuswb m0, m3
add srcq, sstrideq
add src1q, sstrideq
%ifidn %1, v8_avg
- movh m1, [dstq + 8]
- pavgb m3, m1
+ pavgb m0, m4
%endif
- movh [dstq + 8], m3
+ mova [dstq], m0
add dstq, dst_stride
dec heightd
jnz .loop
diff --git a/vpxenc.c b/vpxenc.c
index 35b79de..5e14934 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -380,7 +380,8 @@
static const arg_def_t tile_cols = ARG_DEF(
NULL, "tile-columns", 1, "Number of tile columns to use, log2");
static const arg_def_t tile_rows = ARG_DEF(
- NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+ NULL, "tile-rows", 1,
+ "Number of tile rows to use, log2 (set to 0 while threads > 1)");
static const arg_def_t lossless = ARG_DEF(
NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
static const arg_def_t frame_parallel_decoding = ARG_DEF(
@@ -804,7 +805,6 @@
int arg_ctrls[ARG_CTRL_CNT_MAX][2];
int arg_ctrl_cnt;
int write_webm;
- int have_kf_max_dist;
#if CONFIG_VP9_HIGHBITDEPTH
// whether to use 16bit internal buffers
int use_16bit_internal;
@@ -1224,7 +1224,6 @@
config->cfg.kf_min_dist = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_max_dist, argi)) {
config->cfg.kf_max_dist = arg_parse_uint(&arg);
- config->have_kf_max_dist = 1;
} else if (arg_match(&arg, &kf_disabled, argi)) {
config->cfg.kf_mode = VPX_KF_DISABLED;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1352,19 +1351,6 @@
}
}
-
-static void set_default_kf_interval(struct stream_state *stream,
- struct VpxEncoderConfig *global) {
- /* Use a max keyframe interval of 5 seconds, if none was
- * specified on the command line.
- */
- if (!stream->config.have_kf_max_dist) {
- double framerate = (double)global->framerate.num / global->framerate.den;
- if (framerate > 0.0)
- stream->config.cfg.kf_max_dist = (unsigned int)(5.0 * framerate);
- }
-}
-
static const char* file_type_to_string(enum VideoFileType t) {
switch (t) {
case FILE_TYPE_RAW: return "RAW";
@@ -2087,8 +2073,6 @@
stream->config.cfg.g_timebase.num = global.framerate.den);
}
- FOREACH_STREAM(set_default_kf_interval(stream, &global));
-
/* Show configuration */
if (global.verbose && pass == 0)
FOREACH_STREAM(show_stream_config(stream, &global, &input));