Merge "mips msa vp9 idct 16x16 optimization"
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index e6e6695..383ec4c 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -87,13 +87,13 @@
specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/;
+specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/;
+specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/;
+specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -126,13 +126,13 @@
specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/;
+specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/;
+specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/;
+specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -165,13 +165,13 @@
specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/;
+specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_left_predictor_16x16/;
add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/;
+specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
diff --git a/vp9/common/x86/vp9_intrapred_sse2.asm b/vp9/common/x86/vp9_intrapred_sse2.asm
index 69b07f6..92ac491 100644
--- a/vp9/common/x86/vp9_intrapred_sse2.asm
+++ b/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -15,6 +15,10 @@
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
SECTION .text
@@ -40,6 +44,46 @@
RET
INIT_MMX sse
+cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset ; fill a 4x4 block at dst with the rounded mean of the 4 bytes at left (above is not read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ pxor m1, m1 ; m1 = 0, the reference operand for psadbw
+ movd m0, [leftq] ; load the 4 bytes at left (upper half of m0 is cleared)
+ psadbw m0, m1 ; sum of absolute differences against zero = sum of the loaded bytes
+ paddw m0, [GLOBAL(pw2_4)] ; add 2 (pw2_4 holds words of 2) so the shift below rounds to nearest
+ psraw m0, 2 ; shift right by 2, i.e. divide the sum by 4
+ pshufw m0, m0, 0x0 ; broadcast the low word to all four words
+ packuswb m0, m0 ; narrow words to bytes: every byte now holds the DC value
+ movd [dstq ], m0 ; row 0
+ movd [dstq+strideq], m0 ; row 1
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ movd [dstq ], m0 ; row 2
+ movd [dstq+strideq], m0 ; row 3
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset ; fill a 4x4 block at dst with the rounded mean of the 4 bytes at above (left is not read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ pxor m1, m1 ; m1 = 0, the reference operand for psadbw
+ movd m0, [aboveq] ; load the 4 bytes at above (upper half of m0 is cleared)
+ psadbw m0, m1 ; sum of absolute differences against zero = sum of the loaded bytes
+ paddw m0, [GLOBAL(pw2_4)] ; add 2 (pw2_4 holds words of 2) so the shift below rounds to nearest
+ psraw m0, 2 ; shift right by 2, i.e. divide the sum by 4
+ pshufw m0, m0, 0x0 ; broadcast the low word to all four words
+ packuswb m0, m0 ; narrow words to bytes: every byte now holds the DC value
+ movd [dstq ], m0 ; row 0
+ movd [dstq+strideq], m0 ; row 1
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ movd [dstq ], m0 ; row 2
+ movd [dstq+strideq], m0 ; row 3
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -68,6 +112,91 @@
RESTORE_GOT
RET
+INIT_MMX sse
+cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset ; fill an 8x8 block at dst with the rounded mean of the 8 bytes at above (left is not read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ pxor m1, m1 ; m1 = 0, the reference operand for psadbw
+ movq m0, [aboveq] ; load the 8 bytes at above; above is not referenced after this line
+ DEFINE_ARGS dst, stride, stride3 ; re-declare the named arguments, introducing stride3 (NOTE(review): presumably reuses the register that held above - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address every fourth row
+ psadbw m0, m1 ; sum of absolute differences against zero = sum of the 8 loaded bytes
+ paddw m0, [GLOBAL(pw2_8)] ; add 4 (pw2_8 holds words of 4) so the shift below rounds to nearest
+ psraw m0, 3 ; shift right by 3, i.e. divide the sum by 8
+ pshufw m0, m0, 0x0 ; broadcast the low word to all four words
+ packuswb m0, m0 ; narrow words to bytes: all 8 bytes now hold the DC value
+ movq [dstq ], m0 ; row 0
+ movq [dstq+strideq ], m0 ; row 1
+ movq [dstq+strideq*2], m0 ; row 2
+ movq [dstq+stride3q ], m0 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ movq [dstq ], m0 ; row 4
+ movq [dstq+strideq ], m0 ; row 5
+ movq [dstq+strideq*2], m0 ; row 6
+ movq [dstq+stride3q ], m0 ; row 7
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset ; fill an 8x8 block at dst with the rounded mean of the 8 bytes at left (above is not read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ pxor m1, m1 ; m1 = 0, the reference operand for psadbw
+ movq m0, [leftq] ; load the 8 bytes at left; left is not referenced after this line
+ DEFINE_ARGS dst, stride, stride3 ; re-declare the named arguments, introducing stride3 (NOTE(review): presumably reuses the register that held above - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address every fourth row
+ psadbw m0, m1 ; sum of absolute differences against zero = sum of the 8 loaded bytes
+ paddw m0, [GLOBAL(pw2_8)] ; add 4 (pw2_8 holds words of 4) so the shift below rounds to nearest
+ psraw m0, 3 ; shift right by 3, i.e. divide the sum by 8
+ pshufw m0, m0, 0x0 ; broadcast the low word to all four words
+ packuswb m0, m0 ; narrow words to bytes: all 8 bytes now hold the DC value
+ movq [dstq ], m0 ; row 0
+ movq [dstq+strideq ], m0 ; row 1
+ movq [dstq+strideq*2], m0 ; row 2
+ movq [dstq+stride3q ], m0 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ movq [dstq ], m0 ; row 4
+ movq [dstq+strideq ], m0 ; row 5
+ movq [dstq+strideq*2], m0 ; row 6
+ movq [dstq+stride3q ], m0 ; row 7
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset ; fill a 4x4 block at dst with the constant byte 128 (neither above nor left is read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ DEFINE_ARGS dst, stride, stride3 ; re-declare the named arguments, introducing stride3 (NOTE(review): presumably reuses the register that held above - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address row 3
+ movd m0, [GLOBAL(dc_128)] ; load 4 bytes from the dc_128 table (bytes of value 128)
+ movd [dstq ], m0 ; row 0
+ movd [dstq+strideq ], m0 ; row 1
+ movd [dstq+strideq*2], m0 ; row 2
+ movd [dstq+stride3q ], m0 ; row 3
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset ; fill an 8x8 block at dst with the constant byte 128 (neither above nor left is read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ DEFINE_ARGS dst, stride, stride3 ; re-declare the named arguments, introducing stride3 (NOTE(review): presumably reuses the register that held above - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address every fourth row
+ movq m0, [GLOBAL(dc_128)] ; load 8 bytes from the dc_128 table (bytes of value 128)
+ movq [dstq ], m0 ; row 0
+ movq [dstq+strideq ], m0 ; row 1
+ movq [dstq+strideq*2], m0 ; row 2
+ movq [dstq+stride3q ], m0 ; row 3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ movq [dstq ], m0 ; row 4
+ movq [dstq+strideq ], m0 ; row 5
+ movq [dstq+strideq*2], m0 ; row 6
+ movq [dstq+stride3q ], m0 ; row 7
+ RESTORE_GOT
+ RET
+
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -100,6 +229,60 @@
RESTORE_GOT
REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset ; fill a 16x16 block at dst with the rounded mean of the 16 bytes at above (left is not read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ pxor m1, m1 ; m1 = 0, the reference operand for psadbw
+ pxor m2, m2 ; m2 = 0
+ mova m0, [aboveq] ; load the 16 bytes at above; above is not referenced after this line
+ DEFINE_ARGS dst, stride, stride3, lines4 ; re-declare the named arguments, introducing stride3 and lines4 (NOTE(review): presumably reuse the registers that held above and left - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address every fourth row
+ mov lines4d, 4 ; loop counter: 4 iterations of 4 rows each = 16 rows
+ psadbw m0, m1 ; per-qword byte sums: low 8 bytes and high 8 bytes are summed separately
+ psadbw m2, m1 ; m2 was zeroed above, so this result is zero
+ paddw m0, m2 ; adds zero; NOTE(review): the m2 path contributes nothing here and looks removable
+ movhlps m2, m0 ; copy the high-qword partial sum into the low qword of m2
+ paddw m0, m2 ; low word of m0 = sum of all 16 bytes
+ paddw m0, [GLOBAL(pw2_16)] ; add 8 (pw2_16 holds words of 8) so the shift below rounds to nearest
+ psraw m0, 4 ; shift right by 4, i.e. divide the sum by 16
+ pshuflw m0, m0, 0x0 ; broadcast the low word across the low four words
+ punpcklqdq m0, m0 ; duplicate the low qword into the high qword
+ packuswb m0, m0 ; narrow words to bytes: all 16 bytes now hold the DC value
+.loop: ; each pass stores four rows
+ mova [dstq ], m0 ; row 4k + 0
+ mova [dstq+strideq ], m0 ; row 4k + 1
+ mova [dstq+strideq*2], m0 ; row 4k + 2
+ mova [dstq+stride3q ], m0 ; row 4k + 3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec lines4d
+ jnz .loop ; repeat until all 4 groups of rows are written
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset ; fill a 16x16 block at dst with the constant byte 128 (neither above nor left is read)
+ GET_GOT goffsetq ; NOTE(review): presumably sets up the base that GLOBAL() uses in position-independent builds - confirm in the macro definition
+
+ DEFINE_ARGS dst, stride, stride3, lines4 ; re-declare the named arguments, introducing stride3 and lines4 (NOTE(review): presumably reuse the registers that held above and left - confirm against x86inc)
+ lea stride3q, [strideq*3] ; stride3 = 3 * stride, used to address every fourth row
+ mov lines4d, 4 ; loop counter: 4 iterations of 4 rows each = 16 rows
+ mova m0, [GLOBAL(dc_128)] ; load all 16 bytes of the dc_128 table (bytes of value 128)
+.loop: ; each pass stores four rows
+ mova [dstq ], m0 ; row 4k + 0
+ mova [dstq+strideq ], m0 ; row 4k + 1
+ mova [dstq+strideq*2], m0 ; row 4k + 2
+ mova [dstq+stride3q ], m0 ; row 4k + 3
+ lea dstq, [dstq+strideq*4] ; advance dst by four rows
+ dec lines4d
+ jnz .loop ; repeat until all 4 groups of rows are written
+ RESTORE_GOT
+ RET
+
+
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
GET_GOT goffsetq
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index cd50ccf..6c35aa4 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2158,7 +2158,6 @@
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
BLOCK_SIZE max_size = BLOCK_64X64;
- int i = 0;
int bs_hist[BLOCK_SIZES] = {0};
// Trap case where we do not have a prediction.
@@ -2188,36 +2187,10 @@
bs_hist);
}
- // adjust observed min and max
+ // Adjust observed min and max for "relaxed" auto partition case.
if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
min_size = min_partition_size[min_size];
max_size = max_partition_size[max_size];
- } else if (cpi->sf.auto_min_max_partition_size ==
- CONSTRAIN_NEIGHBORING_MIN_MAX) {
- // adjust the search range based on the histogram of the observed
- // partition sizes from left, above the previous co-located blocks
- int sum = 0;
- int first_moment = 0;
- int second_moment = 0;
- int var_unnormalized = 0;
-
- for (i = 0; i < BLOCK_SIZES; i++) {
- sum += bs_hist[i];
- first_moment += bs_hist[i] * i;
- second_moment += bs_hist[i] * i * i;
- }
-
- // if variance is small enough,
- // adjust the range around its mean size, which gives a tighter range
- var_unnormalized = second_moment - first_moment * first_moment / sum;
- if (var_unnormalized <= 4 * sum) {
- int mean = first_moment / sum;
- min_size = min_partition_size[mean];
- max_size = max_partition_size[mean];
- } else {
- min_size = min_partition_size[min_size];
- max_size = max_partition_size[max_size];
- }
}
}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c8f1313..9752668 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -51,7 +51,6 @@
#define KF_MAX_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
-#define MIN_GF_INTERVAL 4
#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
@@ -1323,14 +1322,17 @@
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(const TWO_PASS *twopass,
+static int detect_transition_to_still(VP9_COMP *cpi,
int frame_interval, int still_interval,
double loop_decay_rate,
double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
- if (frame_interval > MIN_GF_INTERVAL &&
+ if (frame_interval > rc->min_gf_interval &&
loop_decay_rate >= 0.999 &&
last_decay_rate < 0.9) {
int j;
@@ -1838,7 +1840,7 @@
int int_lbq =
(int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
cpi->common.bit_depth));
- active_min_gf_interval = MIN_GF_INTERVAL + MIN(2, int_max_q / 200);
+ active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
@@ -1894,7 +1896,7 @@
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
- if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
last_loop_decay_rate)) {
allow_alt_ref = 0;
break;
@@ -1960,7 +1962,7 @@
// Should we use the alternate reference frame.
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
- (i >= MIN_GF_INTERVAL)) {
+ (i >= rc->min_gf_interval)) {
// Calculate the boost for alt ref.
rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
&b_boost);
@@ -2239,7 +2241,7 @@
// Special check for transition or high motion followed by a
// static scene.
- if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i,
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
loop_decay_rate, decay_accumulator))
break;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index c32a0df..57bce3a 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1489,7 +1489,7 @@
for (i = 0; i < 4; ++i) {
const PREDICTION_MODE this_mode = intra_mode_list[i];
- if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
+ if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
continue;
mbmi->mode = this_mode;
mbmi->ref_frame[0] = INTRA_FRAME;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8713caa..4c33ffd 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1609,12 +1609,19 @@
return target_index - qindex;
}
-void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
- RATE_CONTROL *const rc) {
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+ RATE_CONTROL *const rc) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Set a minimum interval.
+ rc->min_gf_interval =
+ MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(cpi->framerate * 0.125)));
+
// Set Maximum gf/arf interval.
rc->max_gf_interval =
- MIN(16, (int)(cpi->framerate / 2.0));
+ MIN(MAX_GF_INTERVAL, (int)(cpi->framerate * 0.75));
// Round up to next even number if odd.
rc->max_gf_interval += (rc->max_gf_interval & 0x01);
@@ -1628,6 +1635,9 @@
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
}
void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1654,7 +1664,7 @@
rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
vbr_max_bits);
- vp9_rc_set_gf_max_interval(cpi, rc);
+ vp9_rc_set_gf_interval_range(cpi, rc);
}
#define VBR_PCT_ADJUSTMENT_LIMIT 50
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 705796a..869f6e5 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -73,6 +73,7 @@
int frames_since_golden;
int frames_till_gf_update_due;
+ int min_gf_interval;
int max_gf_interval;
int static_scene_max_gf_interval;
int baseline_gf_interval;
@@ -238,8 +239,8 @@
void vp9_rc_update_framerate(struct VP9_COMP *cpi);
-void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi,
- RATE_CONTROL *const rc);
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+ RATE_CONTROL *const rc);
void vp9_set_target_rate(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4c5ba5d..4999b49 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -115,7 +115,7 @@
FLAG_SKIP_INTRA_LOWVAR;
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
sf->allow_partition_search_skip = 1;
}
@@ -318,11 +318,15 @@
if (!is_keyframe) {
int i;
if (content == VP9E_CONTENT_SCREEN) {
- for (i = 0; i < TX_SIZES; ++i)
- sf->intra_y_mode_mask[i] = INTRA_DC_TM_H_V;
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
} else {
- for (i = 0; i < TX_SIZES; i++)
- sf->intra_y_mode_mask[i] = INTRA_DC;
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_16X16)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+ else
+ // Use H and V intra mode for block sizes < 16X16.
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
}
}
}
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 8722d9c..1d0dbb1 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -101,8 +101,7 @@
typedef enum {
NOT_IN_USE = 0,
RELAXED_NEIGHBORING_MIN_MAX = 1,
- CONSTRAIN_NEIGHBORING_MIN_MAX = 2,
- STRICT_NEIGHBORING_MIN_MAX = 3
+ STRICT_NEIGHBORING_MIN_MAX = 2
} AUTO_MIN_MAX_MODE;
typedef enum {
@@ -340,6 +339,10 @@
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
// This variable enables an early break out of mode testing if the model for
// rd built from the prediction signal indicates a value that's much
// higher than the best rd we've seen so far.
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index e122397..b3491a2 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -195,7 +195,7 @@
oxcf->two_pass_vbrmin_section / 100);
lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
oxcf->two_pass_vbrmax_section) / 100);
- vp9_rc_set_gf_max_interval(cpi, lrc);
+ vp9_rc_set_gf_interval_range(cpi, lrc);
}
void vp9_restore_layer_context(VP9_COMP *const cpi) {