Merge "vp9/decode_tiles_mt: remove unnecessary local"
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index 8c5d5a2..d779706 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -136,7 +136,23 @@
using std::tr1::make_tuple;
-#if HAVE_SSE2
+#if CONFIG_USE_X86INC && HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,  // Test adapter: wraps the 4-argument 8-bit SSE2 routine in a 5-argument (extra bit-depth) signature.
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bps) {
+ assert(bps == 8);  // The wrapped routine has no bit-depth parameter; only 8 is accepted here.
+ return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);  // bps is checked above but not forwarded.
+}
+
+int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,  // Test adapter: wraps the 4-argument 8-bit C routine in a 5-argument (extra bit-depth) signature.
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bps) {
+ assert(bps == 8);  // The wrapped routine has no bit-depth parameter; only 8 is accepted here.
+ return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);  // bps is checked above but not forwarded.
+}
+
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(
@@ -145,7 +161,9 @@
make_tuple(&vp9_highbd_block_error_sse2,
&vp9_highbd_block_error_c, VPX_BITS_12),
make_tuple(&vp9_highbd_block_error_sse2,
- &vp9_highbd_block_error_c, VPX_BITS_8)));
+ &vp9_highbd_block_error_c, VPX_BITS_8),
+ make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
+ &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index c345068..e2454b0 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -279,6 +279,7 @@
int error_resilient_mode;
int log2_tile_cols, log2_tile_rows;
+ int tile_sz_mag;
int byte_alignment;
int skip_loop_filter;
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 62c2942..ad02c95 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -170,8 +170,12 @@
static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
int i;
for (i = 0; i < n; ++i)
+#if CONFIG_MISC_FIXES  // New path: each of the n probabilities is handed to vp10_diff_update_prob.
+ vp10_diff_update_prob(r, &p[i]);  // Helper defined elsewhere; presumably reads an optional delta update for p[i] - confirm.
+#else  // Legacy path: an update flag followed by a 7-bit literal stored as (literal << 1) | 1.
if (vpx_read(r, MV_UPDATE_PROB))
p[i] = (vpx_read_literal(r, 7) << 1) | 1;
+#endif  // CONFIG_MISC_FIXES
}
static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
@@ -1370,6 +1374,15 @@
cm->log2_tile_rows = vpx_rb_read_bit(rb);
if (cm->log2_tile_rows)
cm->log2_tile_rows += vpx_rb_read_bit(rb);
+
+#if CONFIG_MISC_FIXES
+ // tile size magnitude
+ if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
+ cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
+ }
+#else
+ cm->tile_sz_mag = 3;
+#endif
}
typedef struct TileBuffer {
@@ -1378,10 +1391,27 @@
int col; // only used with multi-threaded decoding
} TileBuffer;
+static int mem_get_varsize(const uint8_t *data, const int mag) {  // Reads a size field from data; mag (0..3) selects which reader is used.
+ switch (mag) {
+ case 0:
+ return data[0];  // single byte
+ case 1:
+ return mem_get_le16(data);  // helper defined elsewhere; presumably a 2-byte little-endian read
+ case 2:
+ return mem_get_le24(data);  // presumably a 3-byte little-endian read
+ case 3:
+ return mem_get_le32(data);  // presumably a 4-byte little-endian read
+ }
+
+ assert("Invalid tile size marker value" && 0);  // Reached only when mag is outside 0..3.
+
+ return -1;  // Fallback result when the assertion is compiled out.
+}
+
// Reads the next tile returning its size and adjusting '*data' accordingly
// based on 'is_last'.
static void get_tile_buffer(const uint8_t *const data_end,
- int is_last,
+ const int tile_sz_mag, int is_last,
struct vpx_internal_error_info *error_info,
const uint8_t **data,
vpx_decrypt_cb decrypt_cb, void *decrypt_state,
@@ -1395,12 +1425,12 @@
if (decrypt_cb) {
uint8_t be_data[4];
- decrypt_cb(decrypt_state, *data, be_data, 4);
- size = mem_get_be32(be_data);
+ decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
+ size = mem_get_varsize(be_data, tile_sz_mag);
} else {
- size = mem_get_be32(*data);
+ size = mem_get_varsize(*data, tile_sz_mag);
}
- *data += 4;
+ *data += tile_sz_mag + 1;
if (size > (size_t)(data_end - *data))
vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
@@ -1426,7 +1456,8 @@
const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
TileBuffer *const buf = &tile_buffers[r][c];
buf->col = c;
- get_tile_buffer(data_end, is_last, &pbi->common.error, &data,
+ get_tile_buffer(data_end, pbi->common.tile_sz_mag,
+ is_last, &pbi->common.error, &data,
pbi->decrypt_cb, pbi->decrypt_state, buf);
}
}
diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index 23851af..03a81f5 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@@ -459,6 +459,9 @@
// an invalid bitstream and need to return an error.
uint8_t marker;
+#if CONFIG_MISC_FIXES
+ size_t frame_sz_sum = 0;
+#endif
assert(data_sz);
marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -467,7 +470,7 @@
if ((marker & 0xe0) == 0xc0) {
const uint32_t frames = (marker & 0x7) + 1;
const uint32_t mag = ((marker >> 3) & 0x3) + 1;
- const size_t index_sz = 2 + mag * frames;
+ const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
// This chunk is marked as having a superframe index but doesn't have
// enough data for it, thus it's an invalid superframe index.
@@ -498,13 +501,19 @@
x = clear_buffer;
}
- for (i = 0; i < frames; ++i) {
+ for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
uint32_t this_sz = 0;
for (j = 0; j < mag; ++j)
this_sz |= (*x++) << (j * 8);
sizes[i] = this_sz;
+#if CONFIG_MISC_FIXES
+ frame_sz_sum += this_sz;
+#endif
}
+#if CONFIG_MISC_FIXES
+ sizes[i] = data_sz - index_sz - frame_sz_sum;
+#endif
*count = frames;
}
}
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index 2902ece..d39e3dc 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -163,26 +163,33 @@
case CATEGORY5_TOKEN:
val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
break;
- case CATEGORY6_TOKEN:
+ case CATEGORY6_TOKEN: {
+#if CONFIG_MISC_FIXES
+ const int skip_bits = TX_SIZES - 1 - tx_size;
+#else
+ const int skip_bits = 0;
+#endif
+ const uint8_t *cat6p = cat6_prob + skip_bits;
#if CONFIG_VP9_HIGHBITDEPTH
switch (xd->bd) {
case VPX_BITS_8:
- val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
break;
case VPX_BITS_10:
- val = CAT6_MIN_VAL + read_coeff(cat6_prob, 16, r);
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
break;
case VPX_BITS_12:
- val = CAT6_MIN_VAL + read_coeff(cat6_prob, 18, r);
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
break;
default:
assert(0);
return -1;
}
#else
- val = CAT6_MIN_VAL + read_coeff(cat6_prob, 14, r);
+ val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
#endif
break;
+ }
}
}
v = (val * dqv) >> dq_shift;
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 485fc53..1661fbd 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -122,8 +122,11 @@
static void pack_mb_tokens(vpx_writer *w,
TOKENEXTRA **tp, const TOKENEXTRA *const stop,
- vpx_bit_depth_t bit_depth) {
+ vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
TOKENEXTRA *p = *tp;
+#if !CONFIG_MISC_FIXES
+ (void) tx;
+#endif
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
@@ -171,6 +174,12 @@
if (b->base_val) {
const int e = p->extra, l = b->len;
+#if CONFIG_MISC_FIXES
+ int skip_bits =
+ (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+#else
+ int skip_bits = 0;
+#endif
if (l) {
const unsigned char *pb = b->prob;
@@ -180,7 +189,12 @@
do {
const int bb = (v >> --n) & 1;
- vpx_write(w, bb, pb[i >> 1]);
+ if (skip_bits) {
+ skip_bits--;
+ assert(!bb);
+ } else {
+ vpx_write(w, bb, pb[i >> 1]);
+ }
i = b->tree[i + bb];
} while (n);
}
@@ -190,7 +204,7 @@
++p;
}
- *tp = p + (p->token == EOSB_TOKEN);
+ *tp = p;
}
static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
@@ -382,6 +396,7 @@
const VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
MODE_INFO *m;
+ int plane;
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
m = xd->mi[0];
@@ -398,8 +413,16 @@
pack_inter_mode_mvs(cpi, m, w);
}
- assert(*tok < tok_end);
- pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
+ if (!m->mbmi.skip) {
+ assert(*tok < tok_end);
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+ : m->mbmi.tx_size;
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+ }
+ }
}
static void write_partition(const VP10_COMMON *const cm,
@@ -940,7 +963,8 @@
}
}
-static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr) {
+static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr,
+ unsigned int *max_tile_sz) {
VP10_COMMON *const cm = &cpi->common;
vpx_writer residual_bc;
int tile_row, tile_col;
@@ -948,6 +972,7 @@
size_t total_size = 0;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
+ unsigned int max_tile = 0;
memset(cm->above_seg_context, 0,
sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
@@ -971,13 +996,15 @@
vpx_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
// size of this tile
- mem_put_be32(data_ptr + total_size, residual_bc.pos);
+ mem_put_le32(data_ptr + total_size, residual_bc.pos);
+ max_tile = max_tile > residual_bc.pos ? max_tile : residual_bc.pos;
total_size += 4;
}
total_size += residual_bc.pos;
}
}
+ *max_tile_sz = max_tile;
return total_size;
}
@@ -1278,15 +1305,62 @@
return header_bc.pos;
}
-void vp10_pack_bitstream(VP10_COMP *cpi, uint8_t *dest, size_t *size) {
+#if CONFIG_MISC_FIXES
+static int remux_tiles(uint8_t *dest, const int sz,  // Rewrites, in place, every 4-byte tile size prefix in dest[0..sz) as (mag + 1) bytes, compacting the tile data; returns the new total size.
+ const int n_tiles, const int mag) {
+ int rpos = 0, wpos = 0, n;  // rpos: read offset in the 4-byte-prefix layout; wpos: write offset in the compacted layout
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_sz;
+
+ if (n == n_tiles - 1) {
+ tile_sz = sz - rpos;  // the last tile has no size prefix; it owns the remaining bytes
+ } else {
+ tile_sz = mem_get_le32(&dest[rpos]);
+ rpos += 4;  // step over the original 4-byte prefix
+ switch (mag) {
+ case 0:
+ dest[wpos] = tile_sz;
+ break;
+ case 1:
+ mem_put_le16(&dest[wpos], tile_sz);
+ break;
+ case 2:
+ mem_put_le24(&dest[wpos], tile_sz);
+ break;
+ case 3: // remuxing should only happen if mag < 3
+ default:
+ assert("Invalid value for tile size magnitude" && 0);
+ }
+ wpos += mag + 1;  // width of the rewritten prefix
+ }
+
+ memmove(&dest[wpos], &dest[rpos], tile_sz);  // memmove: source and destination ranges lie in the same buffer and may overlap
+ wpos += tile_sz;
+ rpos += tile_sz;
+ }
+
+ assert(rpos > wpos);  // holds once at least one prefix has been shortened
+ assert(rpos == sz);  // all input bytes must have been consumed
+
+ return wpos;
+}
+#endif
+
+void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
+ VP10_COMMON *const cm = &cpi->common;
uint8_t *data = dest;
size_t first_part_size, uncompressed_hdr_size;
struct vpx_write_bit_buffer wb = {data, 0};
struct vpx_write_bit_buffer saved_wb;
+ unsigned int max_tile, data_sz;
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ const int have_tiles = n_log2_tiles > 0;
write_uncompressed_header(cpi, &wb);
saved_wb = wb;
- vpx_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size
+ // don't know in advance first part. size
+ vpx_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
data += uncompressed_hdr_size;
@@ -1295,10 +1369,32 @@
first_part_size = write_compressed_header(cpi, data);
data += first_part_size;
+
+ data_sz = encode_tiles(cpi, data, &max_tile);
+#if CONFIG_MISC_FIXES
+ if (max_tile > 0) {
+ int mag;
+ unsigned int mask;
+
+ // Choose the (tile size) magnitude
+ for (mag = 0, mask = 0xff; mag < 4; mag++) {
+ if (max_tile <= mask)
+ break;
+ mask <<= 8;
+ mask |= 0xff;
+ }
+ assert(n_log2_tiles > 0);
+ vpx_wb_write_literal(&saved_wb, mag, 2);
+ if (mag < 3)
+ data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+ } else {
+ assert(n_log2_tiles == 0);
+ }
+#endif
+ data += data_sz;
+
// TODO(jbb): Figure out what to do if first_part_size > 16 bits.
vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
- data += encode_tiles(cpi, data);
-
*size = data - dest;
}
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 019e5b1..ce1530c 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1356,9 +1356,6 @@
if (output_enabled) {
update_stats(&cpi->common, td);
-
- (*tp)->token = EOSB_TOKEN;
- (*tp)++;
}
}
diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index ca2de1f..0736c65 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c
@@ -15,6 +15,7 @@
#include "vp10/encoder/cost.h"
#include "vp10/encoder/encodemv.h"
+#include "vp10/encoder/subexp.h"
#include "vpx_dsp/vpx_dsp_common.h"
@@ -134,8 +135,12 @@
}
}
-static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
- vpx_prob upd_p) {
+static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+ vpx_prob upd_p) {
+#if CONFIG_MISC_FIXES
+ (void) upd_p;
+ vp10_cond_prob_diff_update(w, cur_p, ct);
+#else
const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
@@ -144,7 +149,7 @@
*cur_p = new_p;
vpx_write_literal(w, new_p >> 1, 7);
}
- return update;
+#endif
}
static void write_mv_update(const vpx_tree_index *tree,
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index af915fe..cbebd5a 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -628,8 +628,16 @@
}
if (!dry_run) {
+ int plane;
+
td->counts->skip[ctx][0] += skip_inc;
- vp10_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+ &arg);
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
} else {
vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
}
diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index 304f74e..409ed1c 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@@ -91,7 +91,9 @@
size_t pending_cx_data_sz;
int pending_frame_count;
size_t pending_frame_sizes[8];
+#if !CONFIG_MISC_FIXES
size_t pending_frame_magnitude;
+#endif
vpx_image_t preview_img;
vpx_enc_frame_flags_t next_frame_flags;
vp8_postproc_cfg_t preview_ppcfg;
@@ -781,24 +783,39 @@
uint8_t marker = 0xc0;
unsigned int mask;
int mag, index_sz;
+#if CONFIG_MISC_FIXES
+ int i;
+ size_t max_frame_sz = 0;
+#endif
assert(ctx->pending_frame_count);
assert(ctx->pending_frame_count <= 8);
// Add the number of frames to the marker byte
marker |= ctx->pending_frame_count - 1;
+#if CONFIG_MISC_FIXES
+ for (i = 0; i < ctx->pending_frame_count - 1; i++) {
+ const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i];
+ max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
+ }
+#endif
// Choose the magnitude
for (mag = 0, mask = 0xff; mag < 4; mag++) {
+#if CONFIG_MISC_FIXES
+ if (max_frame_sz <= mask)
+ break;
+#else
if (ctx->pending_frame_magnitude < mask)
break;
+#endif
mask <<= 8;
mask |= 0xff;
}
marker |= mag << 3;
// Write the index
- index_sz = 2 + (mag + 1) * ctx->pending_frame_count;
+ index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
int i, j;
@@ -818,7 +835,7 @@
#endif
*x++ = marker;
- for (i = 0; i < ctx->pending_frame_count; i++) {
+ for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i];
for (j = 0; j <= mag; j++) {
@@ -974,7 +991,9 @@
ctx->pending_cx_data = cx_data;
ctx->pending_cx_data_sz += size;
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
ctx->pending_frame_magnitude |= size;
+#endif
cx_data += size;
cx_data_sz -= size;
@@ -991,7 +1010,9 @@
ctx->pending_cx_data = NULL;
ctx->pending_cx_data_sz = 0;
ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
ctx->pending_frame_magnitude = 0;
+#endif
ctx->output_cx_pkt_cb.output_cx_pkt(
&pkt, ctx->output_cx_pkt_cb.user_priv);
}
@@ -1008,7 +1029,9 @@
if (ctx->pending_cx_data) {
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
+#if !CONFIG_MISC_FIXES
ctx->pending_frame_magnitude |= size;
+#endif
ctx->pending_cx_data_sz += size;
// write the superframe only for the case when
if (!ctx->output_cx_pkt_cb.output_cx_pkt)
@@ -1018,7 +1041,9 @@
ctx->pending_cx_data = NULL;
ctx->pending_cx_data_sz = 0;
ctx->pending_frame_count = 0;
+#if !CONFIG_MISC_FIXES
ctx->pending_frame_magnitude = 0;
+#endif
} else {
pkt.data.frame.buf = cx_data;
pkt.data.frame.sz = size;
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index e633691..ed5f4ca 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -241,11 +241,15 @@
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-# the transform coefficients are held in 32-bit
-# values, so the assembler code for vp9_block_error can no longer be used.
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error/;
+ add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
+
+ add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
@@ -320,9 +324,6 @@
# ENCODEMB INVOKE
- add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error sse2/;
-
add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp/;
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 4cac388..678e312 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -195,8 +195,8 @@
int mi_row,
int mi_col,
PICK_MODE_CONTEXT *ctx,
- int *motion_magnitude
- ) {
+ int *motion_magnitude,
+ int is_skin) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
MV_REFERENCE_FRAME frame;
@@ -214,6 +214,9 @@
saved_mbmi = *mbmi;
+ if (is_skin && *motion_magnitude > 16)
+ return COPY_BLOCK;
+
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME &&
@@ -313,18 +316,37 @@
int mi_row, int mi_col, BLOCK_SIZE bs,
PICK_MODE_CONTEXT *ctx) {
int motion_magnitude = 0;
- VP9_DENOISER_DECISION decision = FILTER_BLOCK;
+ VP9_DENOISER_DECISION decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride,
mi_row, mi_col);
struct buf_2d src = mb->plane[0].src;
+ int is_skin = 0;
+
+ if (bs <= BLOCK_16X16) {
+ // Take center pixel in block to determine is_skin.
+ const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
+ const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
+ const int uv_width_shift = y_width_shift >> 1;
+ const int uv_height_shift = y_height_shift >> 1;
+ const int stride = mb->plane[0].src.stride;
+ const int strideuv = mb->plane[1].src.stride;
+ const uint8_t ysource =
+ mb->plane[0].src.buf[y_height_shift * stride + y_width_shift];
+ const uint8_t usource =
+ mb->plane[1].src.buf[uv_height_shift * strideuv + uv_width_shift];
+ const uint8_t vsource =
+ mb->plane[2].src.buf[uv_height_shift * strideuv + uv_width_shift];
+ is_skin = vp9_skin_pixel(ysource, usource, vsource);
+ }
decision = perform_motion_compensation(denoiser, mb, bs,
denoiser->increase_denoising,
mi_row, mi_col, ctx,
- &motion_magnitude);
+ &motion_magnitude,
+ is_skin);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride,
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index c66fdf4..ec0b25e 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -12,6 +12,7 @@
#define VP9_ENCODER_DENOISER_H_
#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1818906..1944291 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -269,6 +269,71 @@
*out_dist_sum = dist_sum << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,  // Returns the sum of squared (coeff - dqcoeff) over block_size entries and stores the sum of squared coeff in *ssz, both scaled down for bit depth bd.
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);  // 0 when bd == 8, so no scaling in that case
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;  // half of 2^shift, for round-to-nearest before the right shift
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;  // accumulated in 64 bits
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,  // 8-bit variant: clamps both inputs to 16 bits, returns the sum of squared differences and stores the sum of squared (clamped) coeff in *ssz.
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz) {
+ int i;
+ int32_t c, d;
+ int64_t error = 0, sqcoeff = 0;
+ int16_t diff;  // 16-bit on purpose or not: d - c is narrowed on assignment below
+
+ const int32_t hi = 0x00007fff;  // upper clamp bound, 32767
+ const int32_t lo = 0xffff8000;  // intended lower clamp bound, -32768 (relies on the unsigned-to-int32 conversion)
+
+ for (i = 0; i < block_size; i++) {
+ c = coeff[i];
+ d = dqcoeff[i];
+
+ // Saturate to 16 bits
+ c = (c > hi) ? hi : ((c < lo) ? lo : c);
+ d = (d > hi) ? hi : ((d < lo) ? lo : d);
+
+ diff = d - c;  // NOTE(review): narrowed to int16_t; presumably mirrors the 16-bit psubw in the SSE2 version - confirm
+ error += diff * diff;  // operands promote to int, so the square is computed at int width
+ sqcoeff += c * c;  // |c| <= 32768 after clamping, so the product is at most 2^30
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,  // Chooses the block-error routine from the bit depth bd.
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bd) {
+ if (bd == 8) {
+ return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);  // 8-bit routine takes no bd argument
+ } else {
+ return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);  // generic routine for every other bd
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz) {
int i;
@@ -297,30 +362,6 @@
return error;
}
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size,
- int64_t *ssz, int bd) {
- int i;
- int64_t error = 0, sqcoeff = 0;
- int shift = 2 * (bd - 8);
- int rounding = shift > 0 ? 1 << (shift - 1) : 0;
-
- for (i = 0; i < block_size; i++) {
- const int64_t diff = coeff[i] - dqcoeff[i];
- error += diff * diff;
- sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
- }
- assert(error >= 0 && sqcoeff >= 0);
- error = (error + rounding) >> shift;
- sqcoeff = (sqcoeff + rounding) >> shift;
-
- *ssz = sqcoeff;
- return error;
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
/* The trailing '0' is a terminator which is used inside cost_coeffs() to
* decide whether to include cost of a trailing EOB node or not (i.e. we
* can skip this if the last coefficient in this transform block, e.g. the
@@ -430,8 +471,9 @@
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
- *out_dist = vp9_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
- &this_sse, bd) >> shift;
+ *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff,
+ 16 << ss_txfrm_size,
+ &this_sse, bd) >> shift;
#else
*out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
@@ -831,7 +873,7 @@
ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
so->scan, so->neighbors,
cpi->sf.use_fast_coef_costing);
- distortion += vp9_highbd_block_error(
+ distortion += vp9_highbd_block_error_dispatch(
coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &unused, xd->bd) >> 2;
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -929,8 +971,13 @@
ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
so->scan, so->neighbors,
cpi->sf.use_fast_coef_costing);
+#if CONFIG_VP9_HIGHBITDEPTH
+ distortion += vp9_highbd_block_error_8bit(
+ coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >> 2;
+#else
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &unused) >> 2;
+#endif
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1368,6 +1415,9 @@
k = i;
for (idy = 0; idy < height / 4; ++idy) {
for (idx = 0; idx < width / 4; ++idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+#endif
int64_t ssz, rd, rd1, rd2;
tran_low_t* coeff;
@@ -1377,14 +1427,8 @@
coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- thisdistortion += vp9_highbd_block_error(coeff,
- BLOCK_OFFSET(pd->dqcoeff, k),
- 16, &ssz, xd->bd);
- } else {
- thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
- 16, &ssz);
- }
+ thisdistortion += vp9_highbd_block_error_dispatch(
+ coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
#else
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
diff --git a/vp9/encoder/vp9_skin_detection.c b/vp9/encoder/vp9_skin_detection.c
index aaa8ea0..c2763b7 100644
--- a/vp9/encoder/vp9_skin_detection.c
+++ b/vp9/encoder/vp9_skin_detection.c
@@ -98,12 +98,13 @@
uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)];
uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)];
uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)];
+ int is_skin = 0;
if (mode_filter == 1) {
ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2;
usource = (usource + usource2 + usource3 + usource4) >> 2;
vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2;
}
- const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+ is_skin = vp9_skin_pixel(ysource, usource, vsource);
for (i = 0; i < y_bsize; i++) {
for (j = 0; j < y_bsize; j++) {
if (is_skin)
diff --git a/vp9/encoder/vp9_skin_detection.h b/vp9/encoder/vp9_skin_detection.h
index 3d4e737..0a87ef9 100644
--- a/vp9/encoder/vp9_skin_detection.h
+++ b/vp9/encoder/vp9_skin_detection.h
@@ -25,7 +25,8 @@
#ifdef OUTPUT_YUV_SKINMAP
// For viewing skin map on input source.
-void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f);
#endif
#ifdef __cplusplus
diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
new file mode 100644
index 0000000..f3b8f01
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
@@ -0,0 +1,98 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+; intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM sse2
+cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*4] ; point past the end of coeff (4 bytes per entry)
+ lea dqcq, [dqcq+sizeq*4] ; point past the end of dqcoeff
+ neg sizeq ; negative index that counts up towards zero
+
+ ALIGN 16
+
+.loop:
+ mova m0, [dqcq+sizeq*4] ; dqcoeff entries 0..3 as 32-bit values
+ packssdw m0, [dqcq+sizeq*4+mmsize] ; pack entries 0..7 to 16 bits with signed saturation
+ mova m2, [uqcq+sizeq*4]
+ packssdw m2, [uqcq+sizeq*4+mmsize] ; coeff entries 0..7, saturated to 16 bits
+
+ mova m1, [dqcq+sizeq*4+mmsize*2]
+ packssdw m1, [dqcq+sizeq*4+mmsize*3] ; dqcoeff entries 8..15
+ mova m3, [uqcq+sizeq*4+mmsize*2]
+ packssdw m3, [uqcq+sizeq*4+mmsize*3] ; coeff entries 8..15
+
+ add sizeq, mmsize ; advance by the 16 entries just loaded; sets the flags tested by jnz below
+
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+ psubw m0, m2 ; 16-bit differences dqcoeff - coeff
+ pmaddwd m2, m2 ; coeff squared, adjacent pairs summed into 32-bit lanes
+ pmaddwd m0, m0 ; differences squared, adjacent pairs summed into 32-bit lanes
+
+ psubw m1, m3
+ pmaddwd m3, m3
+ pmaddwd m1, m1
+
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5 ; interleave with zero: low two 32-bit sums widened to 64 bits
+ punpckhdq m0, m5 ; high two 32-bit sums widened to 64 bits
+ paddq m4, m7
+
+ punpckldq m7, m2, m5
+ punpckhdq m2, m5
+ paddq m6, m7
+
+ punpckldq m7, m1, m5
+ punpckhdq m1, m5
+ paddq m4, m7
+
+ punpckldq m7, m3, m5
+ punpckhdq m3, m5
+ paddq m6, m7
+
+ paddq m4, m0
+ paddq m4, m1
+ paddq m6, m2
+ paddq m6, m3
+
+ jnz .loop ; flags come from the add above; the SIMD instructions in between leave them untouched
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4 ; bring the upper 64-bit lane down
+ movhlps m7, m6
+ paddq m4, m5 ; m4 low qword = total error
+ paddq m6, m7 ; m6 low qword = total ssz
+
+%if ARCH_X86_64
+ movq rax, m4 ; 64-bit return value
+ movq [sszq], m6 ; *ssz
+%else
+ mov eax, sszm ; load the ssz pointer argument
+ pshufd m5, m4, 0x1 ; move bits 32..63 of the error into the low dword
+ movq [eax], m6 ; *ssz
+ movd eax, m4 ; low half of the return value
+ movd edx, m5 ; high half of the return value (edx:eax)
+%endif
+ RET
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 84b12d7..a2cbacf 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -100,8 +100,12 @@
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+else
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
endif
+endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
index 2f3cadd..ca21539 100644
--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -69,11 +69,9 @@
%if CONFIG_VP9_HIGHBITDEPTH
; coeff stored as 32bit numbers & require 16bit numbers
mova m9, [ coeffq+ncoeffq*4+ 0]
- mova m6, [ coeffq+ncoeffq*4+16]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
- mova m11, [ coeffq+ncoeffq*4+48]
- packssdw m9, m6 ; m9 = c[i]
- packssdw m10, m11 ; m10 = c[i]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
@@ -171,11 +169,9 @@
%if CONFIG_VP9_HIGHBITDEPTH
; pack coeff from 32bit to 16bit array
mova m9, [ coeffq+ncoeffq*4+ 0]
- mova m6, [ coeffq+ncoeffq*4+16]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
mova m10, [ coeffq+ncoeffq*4+32]
- mova m11, [ coeffq+ncoeffq*4+48]
- packssdw m9, m6 ; m9 = c[i]
- packssdw m10, m11 ; m10 = c[i]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]