Merge "Add neon optimize vp9_short_idct4x4_1_add."
diff --git a/test/md5_helper.h b/test/md5_helper.h
index fc1a974..f34054d 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -25,9 +25,15 @@
void Add(const vpx_image_t *img) {
for (int plane = 0; plane < 3; ++plane) {
- uint8_t *buf = img->planes[plane];
- const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
- const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+ const uint8_t *buf = img->planes[plane];
+ // Calculate the width and height to do the md5 check. For the chroma
+ // planes the size must be rounded up, not down, so that no pixel is
+ // skipped: adding chroma_shift before shifting gives the ceiling.
+ // This works only for chroma_shift of 0 and 1.
+ const int h = plane ? (img->d_h + img->y_chroma_shift) >>
+ img->y_chroma_shift : img->d_h;
+ const int w = plane ? (img->d_w + img->x_chroma_shift) >>
+ img->x_chroma_shift : img->d_w;
for (int y = 0; y < h; ++y) {
MD5Update(&md5_, buf, w);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 15b0bc6..e15c44d 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -735,7 +735,8 @@
sf->mode_search_skip_flags = 0;
sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 0;
- sf->last_chroma_intra_mode = TM_PRED;
+ sf->intra_y_mode_mask = ALL_INTRA_MODES;
+ sf->intra_uv_mode_mask = ALL_INTRA_MODES;
sf->use_rd_breakout = 0;
sf->skip_encode_sb = 0;
sf->use_uv_intra_rd_estimate = 0;
@@ -765,7 +766,7 @@
sf->static_segmentation = 0;
#endif
sf->use_avoid_tested_higherror = 1;
- sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh = MIN((speed + 1), 4);
if (speed == 1) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -798,6 +799,9 @@
// the main framework of partition search type.
sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 16;
+
+ sf->intra_y_mode_mask = INTRA_DC_TM_H_V;
+ sf->intra_uv_mode_mask = INTRA_DC_TM_H_V;
}
if (speed == 2) {
sf->adjust_thresholds_by_speed = 1;
@@ -819,7 +823,8 @@
FLAG_SKIP_COMP_REFMISMATCH |
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
- sf->last_chroma_intra_mode = DC_PRED;
+ sf->intra_y_mode_mask = INTRA_DC_TM;
+ sf->intra_uv_mode_mask = INTRA_DC_TM;
sf->use_uv_intra_rd_estimate = 1;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
@@ -859,6 +864,8 @@
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
sf->disable_filter_search_var_thresh = 64;
+ sf->intra_y_mode_mask = INTRA_DC_ONLY;
+ sf->intra_uv_mode_mask = INTRA_DC_ONLY;
}
if (speed == 4) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
@@ -1395,7 +1402,7 @@
}
VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
- int i;
+ int i, j;
volatile union {
VP9_COMP *cpi;
VP9_PTR ptr;
@@ -1597,9 +1604,10 @@
vp9_set_speed_features(cpi);
- // Set starting values of RD threshold multipliers (128 = *1)
- for (i = 0; i < MAX_MODES; i++)
- cpi->rd_thresh_mult[i] = 128;
+ // Default rd threshold factors for mode selection
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ for (j = 0; j < MAX_MODES; ++j)
+ cpi->rd_thresh_freq_fact[i][j] = 32;
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
SDX3F, SDX8F, SDX4DF)\
@@ -2629,8 +2637,6 @@
// Set various flags etc to special state if it is a key frame
if (cm->frame_type == KEY_FRAME) {
- int i;
-
// Reset the loop filter deltas and segmentation map
setup_features(cm);
@@ -2643,10 +2649,6 @@
// The alternate reference frame cannot be active for a key frame
cpi->source_alt_ref_active = 0;
- // Reset the RD threshold multipliers to default of * 1 (128)
- for (i = 0; i < MAX_MODES; i++)
- cpi->rd_thresh_mult[i] = 128;
-
cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
cm->frame_parallel_decoding_mode =
(cpi->oxcf.frame_parallel_decoding_mode != 0);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 1c0b2ca..8328374 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -238,6 +238,11 @@
// Other methods to come
} SUBPEL_SEARCH_METHODS;
+#define ALL_INTRA_MODES 0x3FF
+#define INTRA_DC_ONLY 0x01
+#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED))
+#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED))
+
typedef struct {
int RD;
SEARCH_METHODS search_method;
@@ -288,7 +293,8 @@
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
- MB_PREDICTION_MODE last_chroma_intra_mode;
+ int intra_y_mode_mask;
+ int intra_uv_mode_mask;
int use_rd_breakout;
int use_uv_intra_rd_estimate;
int use_fast_lpf_pick;
@@ -375,8 +381,6 @@
int ref_frame_mask;
int set_ref_frame_mask;
- int rd_thresh_mult[MAX_MODES];
- int rd_baseline_thresh[BLOCK_SIZES][MAX_MODES];
int rd_threshes[BLOCK_SIZES][MAX_MODES];
int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 567d3d0..ee21957 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -104,9 +104,8 @@
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
{2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
-#define BASE_RD_THRESH_FREQ_FACT 16
-#define MAX_RD_THRESH_FREQ_FACT 32
-#define MAX_RD_THRESH_FREQ_INC 1
+#define MAX_RD_THRESH_FACT 64
+#define RD_THRESH_INC 1
static void fill_token_costs(vp9_coeff_cost *c,
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
@@ -212,12 +211,6 @@
} else {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
- cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
- if (cpi->sf.adaptive_rd_thresh)
- cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
- else
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
} else {
@@ -236,12 +229,6 @@
} else {
cpi->rd_threshes[bsize][i] = INT_MAX;
}
- cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i];
-
- if (cpi->sf.adaptive_rd_thresh)
- cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT;
- else
- cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT;
}
}
}
@@ -1043,6 +1030,10 @@
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
int64_t this_rd;
int ratey = 0;
+
+ if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ continue;
+
// Only do the oblique modes if the best so far is
// one of the neighboring directional modes
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -1228,6 +1219,9 @@
int64_t local_tx_cache[TX_MODES];
const int mis = xd->mode_info_stride;
+ if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
+ continue;
+
if (cpi->common.frame_type == KEY_FRAME) {
const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
const MB_PREDICTION_MODE L = xd->left_available ?
@@ -1325,10 +1319,14 @@
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
- MB_PREDICTION_MODE last_mode = bsize <= BLOCK_8X8 ?
- TM_PRED : cpi->sf.last_chroma_intra_mode;
+ // int mode_mask = (bsize <= BLOCK_8X8)
+ // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
- for (mode = DC_PRED; mode <= last_mode; mode++) {
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ // if (!(mode_mask & (1 << mode)))
+ if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
+ continue;
+
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd);
@@ -3216,7 +3214,7 @@
// Test best rd so far against threshold for trying this mode.
if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
- cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) ||
+ cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
cpi->rd_threshes[bsize][mode_index] == INT_MAX)
continue;
@@ -3777,29 +3775,6 @@
}
}
}
-#if 0
- // Testing this mode gave rise to an improvement in best error score.
- // Lower threshold a bit for next time
- cpi->rd_thresh_mult[mode_index] =
- (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
- cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
- cpi->rd_threshes[mode_index] =
- (cpi->rd_baseline_thresh[mode_index] >> 7)
- * cpi->rd_thresh_mult[mode_index];
-#endif
- } else {
- // If the mode did not help improve the best error case then
- // raise the threshold for testing that mode next time around.
-#if 0
- cpi->rd_thresh_mult[mode_index] += 4;
-
- if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
- cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
- cpi->rd_threshes[mode_index] =
- (cpi->rd_baseline_thresh[mode_index] >> 7)
- * cpi->rd_thresh_mult[mode_index];
-#endif
}
/* keep record of best compound/single-only prediction */
@@ -3942,33 +3917,19 @@
if (cpi->sf.adaptive_rd_thresh) {
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
if (mode_index == best_mode_index) {
- cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT;
+ cpi->rd_thresh_freq_fact[bsize][mode_index] -=
+ (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
} else {
- cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC;
+ cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
- (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) {
+ (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
cpi->rd_thresh_freq_fact[bsize][mode_index] =
- cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT;
+ cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
}
}
}
}
- // TODO(rbultje) integrate with RD trd_thresh_freq_facthresholding
-#if 0
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
- int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
- cpi->rd_thresh_mult[best_mode_index] =
- (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
- cpi->rd_threshes[best_mode_index] =
- (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
- }
-#endif
-
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index d3dbefe..3501cf1 100644
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -342,8 +342,8 @@
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm0, [rax] ; Copy 4 bytes to mm0
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -351,12 +351,12 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -365,11 +365,11 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -378,11 +378,11 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy 4 bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy 4 bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6