Merge "Use lookup table to simplify logic"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 3d61d40..b990bf8 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -395,8 +395,7 @@
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
- const int pitch = 32;
- REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch));
+ REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
for (int j = 0; j < kNumCoeffs; ++j) {
const uint32_t diff = dst[j] - src[j];
@@ -421,7 +420,7 @@
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
- pitch_ = 32;
+ pitch_ = 16;
fwd_txfm_ref = fdct16x16_ref;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
@@ -431,7 +430,7 @@
fwd_txfm_(in, out, stride);
}
void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
- inv_txfm_(out, dst, stride >> 1);
+ inv_txfm_(out, dst, stride);
}
fdct_t fwd_txfm_;
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index f456abc..5abb9b1 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -113,8 +113,7 @@
test_input_block[j] = src[j] - dst[j];
}
- const int pitch = 64;
- REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch));
+ REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
for (int j = 0; j < kNumCoeffs; ++j) {
@@ -150,9 +149,9 @@
for (int j = 0; j < kNumCoeffs; ++j)
input_block[j] = rnd.Rand8() - rnd.Rand8();
- const int pitch = 64;
- vp9_short_fdct32x32_c(input_block, output_ref_block, pitch);
- REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch));
+ const int stride = 32;
+ vp9_short_fdct32x32_c(input_block, output_ref_block, stride);
+ REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
if (version_ == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
@@ -188,9 +187,9 @@
for (int j = 0; j < kNumCoeffs; ++j)
input_extreme_block[j] = -255;
- const int pitch = 64;
- vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch);
- REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch));
+ const int stride = 32;
+ vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, stride);
+ REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, stride));
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j) {
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 8ca4f5f..78e54e2 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -1062,7 +1062,7 @@
if (cpi->common.frame_type == KEY_FRAME)
{
/* Reset to default counts/probabilities at key frames */
- vp8_copy(cpi->coef_counts, default_coef_counts);
+ vp8_copy(cpi->mb.coef_counts, default_coef_counts);
}
if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 0a441bd..36e7e83 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -234,31 +234,6 @@
int q_index;
} MACROBLOCKD;
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
- switch (subsize) {
- case BLOCK_64X64:
- case BLOCK_64X32:
- case BLOCK_32X64:
- case BLOCK_32X32:
- return &xd->sb_index;
- case BLOCK_32X16:
- case BLOCK_16X32:
- case BLOCK_16X16:
- return &xd->mb_index;
- case BLOCK_16X8:
- case BLOCK_8X16:
- case BLOCK_8X8:
- return &xd->b_index;
- case BLOCK_8X4:
- case BLOCK_4X8:
- case BLOCK_4X4:
- return &xd->ab_index;
- default:
- assert(0);
- return NULL;
- }
-}
-
static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type,
BLOCK_SIZE sb_size) {
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5c8c03e..3111852 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -146,8 +146,6 @@
TX_MODE tx_mode;
int base_qindex;
- int last_kf_gf_q; /* Q used on the last GF or KF */
-
int y_dc_delta_q;
int uv_dc_delta_q;
int uv_ac_delta_q;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 526be87..6fa9e22 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,13 +701,13 @@
prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct4x4 sse2
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride"
specialize vp9_short_fdct32x32 sse2
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride"
specialize vp9_short_fdct32x32_rd sse2
-prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
+prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int stride"
specialize vp9_short_fdct16x16 sse2
prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c
index 6bfd8f8..ef30404 100644
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -76,7 +76,7 @@
}
-const vp9_tree_index vp9_segment_tree[14] = {
+const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = {
2, 4, 6, 8, 10, 12,
0, -1, -2, -3, -4, -5, -6, -7
};
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h
index f22239b..eb38c06 100644
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -76,7 +76,7 @@
int segment_id,
SEG_LVL_FEATURES feature_id);
-extern const vp9_tree_index vp9_segment_tree[14];
+extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
#endif // VP9_COMMON_VP9_SEG_COMMON_H_
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 5bfe7b7..16e07a5 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -421,11 +421,12 @@
mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
}
-static INLINE void assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
+static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode,
int_mv mv[2], int_mv best_mv[2],
int_mv nearest_mv[2], int_mv near_mv[2],
int is_compound, int allow_hp, vp9_reader *r) {
int i;
+ int ret = 1;
switch (mode) {
case NEWMV:
@@ -434,6 +435,10 @@
if (is_compound)
read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv,
&cm->fc.nmvc, &cm->counts.mv, allow_hp);
+ for (i = 0; i < 1 + is_compound; ++i) {
+ ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW;
+ ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW;
+ }
break;
case NEARESTMV:
mv[0].as_int = nearest_mv[0].as_int;
@@ -451,13 +456,9 @@
mv[1].as_int = 0;
break;
default:
- assert(!"Invalid inter mode value.");
+ return 0;
}
-
- for (i = 0; i < 1 + is_compound; ++i) {
- assert(mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW);
- assert(mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW);
- }
+ return ret;
}
static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) {
@@ -553,8 +554,12 @@
mi_row, mi_col);
}
- assign_mv(cm, b_mode, block, best, nearest, nearmv,
- is_compound, allow_hp, r);
+ if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
+ is_compound, allow_hp, r)) {
+ xd->corrupted |= 1;
+ break;
+ };
+
mi->bmi[j].as_mv[0].as_int = block[0].as_int;
if (is_compound)
@@ -572,8 +577,9 @@
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
} else {
- assign_mv(cm, mbmi->mode, mbmi->mv, best, nearest, nearmv,
- is_compound, allow_hp, r);
+ xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv,
+ best, nearest, nearmv,
+ is_compound, allow_hp, r);
}
}
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b914de7..ec310f4 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -224,14 +224,14 @@
static void decode_modes_b(VP9D_COMP *pbi, int tile_col,
int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize) {
+ vp9_reader *r, BLOCK_SIZE bsize, int index) {
MACROBLOCKD *const xd = &pbi->mb;
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
int eobtotal;
if (less8x8)
- if (xd->ab_index > 0)
+ if (index > 0)
return;
set_offsets(pbi, bsize, tile_col, mi_row, mi_col);
@@ -271,9 +271,10 @@
xd->corrupted |= vp9_reader_has_error(r);
}
+
static void decode_modes_sb(VP9D_COMP *pbi, int tile_col,
int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize) {
+ vp9_reader* r, BLOCK_SIZE bsize, int index) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
@@ -284,7 +285,7 @@
return;
if (bsize < BLOCK_8X8) {
- if (xd->ab_index != 0)
+ if (index > 0)
return;
} else {
int pl;
@@ -306,31 +307,27 @@
}
subsize = get_subsize(bsize, partition);
- *get_sb_index(xd, subsize) = 0;
switch (partition) {
case PARTITION_NONE:
- decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
+ decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
break;
case PARTITION_HORZ:
- decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
+ decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(pbi, tile_col, mi_row + hbs, mi_col, r, subsize);
+ decode_modes_b(pbi, tile_col, mi_row + hbs, mi_col, r, subsize, 1);
break;
case PARTITION_VERT:
- decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize);
- *get_sb_index(xd, subsize) = 1;
+ decode_modes_b(pbi, tile_col, mi_row, mi_col, r, subsize, 0);
if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(pbi, tile_col, mi_row, mi_col + hbs, r, subsize);
+ decode_modes_b(pbi, tile_col, mi_row, mi_col + hbs, r, subsize, 1);
break;
case PARTITION_SPLIT: {
int n;
for (n = 0; n < 4; n++) {
const int j = n >> 1, i = n & 1;
- *get_sb_index(xd, subsize) = n;
decode_modes_sb(pbi, tile_col, mi_row + j * hbs, mi_col + i * hbs,
- r, subsize);
+ r, subsize, n);
}
} break;
default:
@@ -611,7 +608,7 @@
vp9_zero(cm->left_seg_context);
for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
mi_col += MI_BLOCK_SIZE)
- decode_modes_sb(pbi, tile_col, mi_row, mi_col, r, BLOCK_64X64);
+ decode_modes_sb(pbi, tile_col, mi_row, mi_col, r, BLOCK_64X64, 0);
if (pbi->do_loopfilter_inline) {
// delay the loopfilter by 1 macroblock row.
@@ -802,6 +799,7 @@
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
+ size_t sz;
int i;
cm->last_frame_type = cm->frame_type;
@@ -909,8 +907,9 @@
setup_segmentation(&cm->seg, rb);
setup_tile_info(cm, rb);
+ sz = vp9_rb_read_literal(rb, 16);
- return vp9_rb_read_literal(rb, 16);
+ return sz > 0 ? sz : -1;
}
static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index bfac5a7..ed795f0 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -565,13 +565,13 @@
static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col) {
+ int mi_row, int mi_col, int index) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MODE_INFO *m = mi_8x8[0];
if (m->mbmi.sb_type < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (index > 0)
return;
xd->mi_8x8 = mi_8x8;
@@ -597,7 +597,8 @@
static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int index) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
const int mis = cm->mode_info_stride;
@@ -613,11 +614,10 @@
partition = partition_lookup[bsl][m->mbmi.sb_type];
- if (bsize < BLOCK_8X8)
- if (xd->ab_index > 0)
+ if (bsize < BLOCK_8X8) {
+ if (index > 0)
return;
-
- if (bsize >= BLOCK_8X8) {
+ } else {
int pl;
const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
mi_row, mi_col);
@@ -634,31 +634,28 @@
}
subsize = get_subsize(bsize, partition);
- *(get_sb_index(xd, subsize)) = 0;
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_row + bs) < cm->mi_rows)
write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
- mi_col);
+ mi_col, 1);
break;
case PARTITION_VERT:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col);
- *(get_sb_index(xd, subsize)) = 1;
+ write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs,
+ 1);
break;
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
- int j = n >> 1, i = n & 0x01;
- *(get_sb_index(xd, subsize)) = n;
+ const int j = n >> 1, i = n & 1;
write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
- mi_row + j * bs, mi_col + i * bs, subsize);
+ mi_row + j * bs, mi_col + i * bs, subsize, n);
}
break;
default:
@@ -690,7 +687,7 @@
for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
- BLOCK_64X64);
+ BLOCK_64X64, 0);
}
}
}
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index b6555bc..23c652d 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -302,14 +302,13 @@
}
}
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[256];
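A plain-C illustration of the two-pass scheme the comment above describes; this is a sketch only, with a hypothetical fdct16_1d column transform passed in and the real rounding and scaling between passes omitted. Each pass transforms the columns of its source and writes the results transposed, so the second pass effectively transforms the rows and restores row order:

    #include <stdint.h>

    static void fdct16x16_two_pass_sketch(const int16_t *input, int16_t *output,
                                          int stride,
                                          void (*fdct16_1d)(const int16_t *in,
                                                            int16_t *out)) {
      int16_t intermediate[16 * 16];
      const int16_t *src = input;
      int16_t *dst = intermediate;
      int src_stride = stride;
      int pass, i, j;

      for (pass = 0; pass < 2; ++pass) {
        for (i = 0; i < 16; ++i) {
          int16_t col[16], res[16];
          for (j = 0; j < 16; ++j)      /* gather column i of the source */
            col[j] = src[j * src_stride + i];
          fdct16_1d(col, res);          /* 1-D transform of that column */
          for (j = 0; j < 16; ++j)      /* store transposed: results land in row i */
            dst[i * 16 + j] = res[j];
        }
        src = intermediate;             /* pass 2 reads the transposed buffer */
        src_stride = 16;
        dst = output;                   /* and writes final, row-ordered output */
      }
    }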
@@ -1315,8 +1314,7 @@
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1324,7 +1322,7 @@
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] * 4;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
@@ -1344,8 +1342,7 @@
// Note that although we use dct_32_round in dct32_1d computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
- int shortpitch = pitch >> 1;
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1353,7 +1350,7 @@
for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
- temp_in[j] = input[j * shortpitch + i] * 4;
+ temp_in[j] = input[j * stride + i] * 4;
dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 081f331..6e8e1d1 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -50,6 +50,31 @@
int enc_debug = 0;
#endif
+static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+ switch (subsize) {
+ case BLOCK_64X64:
+ case BLOCK_64X32:
+ case BLOCK_32X64:
+ case BLOCK_32X32:
+ return &xd->sb_index;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ return &xd->mb_index;
+ case BLOCK_16X8:
+ case BLOCK_8X16:
+ case BLOCK_8X8:
+ return &xd->b_index;
+ case BLOCK_8X4:
+ case BLOCK_4X8:
+ case BLOCK_4X4:
+ return &xd->ab_index;
+ default:
+ assert(0);
+ return NULL;
+ }
+}
+
static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize);
@@ -554,7 +579,10 @@
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int orig_rdmult = x->rdmult;
- double rdmult_ratio = 1.0;
+ double rdmult_ratio;
+
+ vp9_clear_system_state(); // __asm emms;
+ rdmult_ratio = 1.0; // avoid uninitialized warnings
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@@ -593,7 +621,10 @@
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
- x->rdmult = round(x->rdmult * rdmult_ratio);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ x->rdmult = round(x->rdmult * rdmult_ratio);
+ }
// Find best coding mode & reconstruct the MB so it is available
// as a predictor for MBs that follow in the SB
@@ -609,9 +640,13 @@
totaldist, bsize, ctx, best_rd);
}
- x->rdmult = orig_rdmult;
- if (*totalrate != INT_MAX)
- *totalrate = round(*totalrate * rdmult_ratio);
+ if (cpi->sf.variance_adaptive_quantization) {
+ x->rdmult = orig_rdmult;
+ if (*totalrate != INT_MAX) {
+ vp9_clear_system_state(); // __asm emms;
+ *totalrate = round(*totalrate * rdmult_ratio);
+ }
+ }
}
static void update_stats(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index c1e1a0d..13d8aa8 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -365,9 +365,9 @@
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_short_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -379,7 +379,7 @@
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_short_fdct16x16(src_diff, coeff, bw * 8);
+ vp9_short_fdct16x16(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -532,9 +532,9 @@
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+ vp9_short_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -559,7 +559,7 @@
if (tx_type != DCT_DCT)
vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
else
- vp9_short_fdct16x16(src_diff, coeff, bw * 8);
+ vp9_short_fdct16x16(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 7157451..caf4162 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -554,7 +554,10 @@
int this_error;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
- double error_weight = 1.0;
+ double error_weight;
+
+ vp9_clear_system_state(); // __asm emms;
+ error_weight = 1.0; // avoid uninitialized warnings
xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -587,7 +590,11 @@
}
// do intra 16x16 prediction
- this_error = error_weight * vp9_encode_intra(x, use_dc_pred);
+ this_error = vp9_encode_intra(x, use_dc_pred);
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ this_error *= error_weight;
+ }
// intrapenalty below deals with situations where the intra and inter
// error scores are very low (eg a plain black frame).
@@ -622,7 +629,10 @@
first_pass_motion_search(cpi, x, &best_ref_mv,
&mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
- motion_error *= error_weight;
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ motion_error *= error_weight;
+ }
// If the current best reference mv is not centered on 0,0 then do a 0,0
// based search as well.
@@ -630,7 +640,10 @@
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
lst_yv12, &tmp_err, recon_yoffset);
- tmp_err *= error_weight;
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ tmp_err *= error_weight;
+ }
if (tmp_err < motion_error) {
motion_error = tmp_err;
@@ -647,7 +660,10 @@
first_pass_motion_search(cpi, x, &zero_ref_mv,
&tmp_mv.as_mv, gld_yv12,
&gf_motion_error, recon_yoffset);
- gf_motion_error *= error_weight;
+ if (cpi->sf.variance_adaptive_quantization) {
+ vp9_clear_system_state(); // __asm emms;
+ gf_motion_error *= error_weight;
+ }
if ((gf_motion_error < motion_error) &&
(gf_motion_error < this_error)) {
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 2f147a0..ea4c9e8 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -61,6 +61,7 @@
best_err = cpi->find_fractional_mv_step(
x,
&dst_mv->as_mv, &ref_mv->as_mv,
+ xd->allow_high_precision_mv,
x->errorperbit, &v_fn_ptr,
0, cpi->sf.subpel_iters_per_step, NULL, NULL,
& distortion, &sse);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 561c725..a52f5b1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -275,6 +275,7 @@
int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -348,8 +349,7 @@
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -373,6 +373,7 @@
int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -436,8 +437,7 @@
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -465,6 +465,7 @@
int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -544,8 +545,7 @@
}
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
@@ -568,6 +568,7 @@
int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
@@ -642,8 +643,7 @@
tc = bc;
}
- if (xd->allow_high_precision_mv && vp9_use_mv_hp(ref_mv) &&
- forced_stop == 0) {
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 77c157c..bcab679 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -74,6 +74,7 @@
typedef int (fractional_mv_step_fp) (
MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
@@ -88,6 +89,7 @@
typedef int (fractional_mv_step_comp_fp) (
MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
+ int allow_hp,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 2b1caf4..54b3d43 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -3386,11 +3386,6 @@
#if 0
output_frame_level_debug_stats(cpi);
#endif
- // If this was a kf or Gf note the Q
- if ((cm->frame_type == KEY_FRAME)
- || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
- cm->last_kf_gf_q = cm->base_qindex;
-
if (cpi->refresh_golden_frame == 1)
cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
else
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3eb14c8..30cdb3f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1860,6 +1860,7 @@
cpi->find_fractional_mv_step(x,
&mode_mv[NEWMV].as_mv,
&bsi->ref_mv->as_mv,
+ x->e_mbd.allow_high_precision_mv,
x->errorperbit, v_fn_ptr,
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
@@ -2440,6 +2441,7 @@
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+ xd->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
@@ -2575,6 +2577,7 @@
bestsme = cpi->find_fractional_mv_step_comp(
x, &tmp_mv.as_mv,
&ref_mv[id].as_mv,
+ xd->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[block_size],
0, cpi->sf.subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ff0de4..5cf8143 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -166,6 +166,7 @@
// Ignore mv costing by sending NULL pointer instead of cost array
bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv,
&best_ref_mv1.as_mv,
+ xd->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
0, cpi->sf.subpel_iters_per_step,
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 3d3b4b0..3179ae3 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -37,25 +37,36 @@
unsigned int vp9_vaq_segment_id(int energy) {
ENERGY_IN_BOUNDS(energy);
+
return SEGMENT_ID(energy);
}
double vp9_vaq_rdmult_ratio(int energy) {
ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
return RDMULT_RATIO(energy);
}
double vp9_vaq_inv_q_ratio(int energy) {
ENERGY_IN_BOUNDS(energy);
+
+ vp9_clear_system_state(); // __asm emms;
+
return Q_RATIO(-energy);
}
void vp9_vaq_init() {
int i;
- double base_ratio = 1.8;
+ double base_ratio;
assert(ENERGY_SPAN <= MAX_SEGMENTS);
+ vp9_clear_system_state(); // __asm emms;
+
+ base_ratio = 1.8;
+
for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
Q_RATIO(i) = pow(base_ratio, i/3.0);
}
@@ -74,6 +85,8 @@
seg->abs_delta = SEGMENT_DELTADATA;
+ vp9_clear_system_state(); // __asm emms;
+
for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
int qindex_delta, segment_rdmult;
@@ -89,6 +102,7 @@
segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
cm->y_dc_delta_q);
+
RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
}
}
@@ -120,9 +134,14 @@
}
int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ unsigned int var = block_variance(cpi, x, bs);
+
+ vp9_clear_system_state(); // __asm emms;
+
// if (var <= 1000)
// return 0;
- unsigned int var = block_variance(cpi, x, bs);
- double energy = 0.9*(logf(var + 1) - 10.0);
+
+ energy = 0.9*(logf(var + 1) - 10.0);
return clamp(round(energy), ENERGY_MIN, ENERGY_MAX);
}
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 11eec7f..de47a5b 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -30,11 +30,11 @@
#endif
void FDCT32x32_2D(int16_t *input,
- int16_t *output_org, int pitch) {
+ int16_t *output_org, int stride) {
// Calculate pre-multiplied strides
- const int str1 = pitch >> 1;
- const int str2 = pitch;
- const int str3 = pitch + str1;
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
// Constants
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 5e1e5ed..457883f 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -1056,14 +1056,13 @@
write_buffer_8x8(output, in, 8);
}
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
// as the first pass results are transposed, we transpose the columns (that
// is the transposed rows) and transpose the results (so that it goes back
// in normal/row positions).
- const int stride = pitch >> 1;
int pass;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);