Merge "Add several new test vectors with small resolution."
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index d779706..77b12ea 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -67,12 +67,22 @@
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
- coeff[j] = rnd(2 << 20) - (1 << 20);
- dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+ // coeff and dqcoeff will always have at least the same sign, and this
+ // can be used for optimization, so generate test input precisely.
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << msb);
+ dqcoeff[j] = rnd(1 << msb);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << msb);
+ dqcoeff[j] = -rnd(1 << msb);
+ }
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
bit_depth_);
@@ -85,7 +95,7 @@
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match SSE2 output. "
+ << "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure;
}
@@ -100,23 +110,36 @@
int64_t ret;
int64_t ref_ssz;
int64_t ref_ret;
- int max_val = ((1 << 20) - 1);
+ const int msb = bit_depth_ + 8 - 1;
+ int max_val = ((1 << msb) - 1);
for (int i = 0; i < kNumIterations; ++i) {
int err_count = 0;
- int k = (i / 9) % 5;
+ int k = (i / 9) % 9;
// Change the maximum coeff value, to test different bit boundaries
- if ( k == 4 && (i % 9) == 0 ) {
+ if ( k == 8 && (i % 9) == 0 ) {
max_val >>= 1;
}
block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64
for (int j = 0; j < block_size; j++) {
- if (k < 4) { // Test at maximum values
- coeff[j] = k % 2 ? max_val : -max_val;
- dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+ if (k < 4) {
+ // Test at positive maximum values
+ coeff[j] = k % 2 ? max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+ } else if (k < 8) {
+ // Test at negative maximum values
+ coeff[j] = k % 2 ? -max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
} else {
- coeff[j] = rnd(2 << 14) - (1 << 14);
- dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
}
}
ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
@@ -130,21 +153,13 @@
err_count_total += err_count;
}
EXPECT_EQ(0, err_count_total)
- << "Error: Error Block Test, C output doesn't match SSE2 output. "
+ << "Error: Error Block Test, C output doesn't match optimized output. "
<< "First failed at test case " << first_failure;
}
using std::tr1::make_tuple;
-#if CONFIG_USE_X86INC && HAVE_SSE2
-int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size,
- int64_t *ssz, int bps) {
- assert(bps == 8);
- return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
-}
-
+#if CONFIG_USE_X86INC
int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
@@ -153,6 +168,15 @@
return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
}
+#if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bps) {
+ assert(bps == 8);
+ return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(
@@ -165,5 +189,23 @@
make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
&wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
#endif // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bps) {
+ assert(bps == 8);
+ return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ AVX, ErrorBlockTest,
+ ::testing::Values(
+ make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+ &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif // HAVE_AVX
+
+#endif // CONFIG_USE_X86INC
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/vp10/common/blockd.c b/vp10/common/blockd.c
index 5394b5e..b6f910f 100644
--- a/vp10/common/blockd.c
+++ b/vp10/common/blockd.c
@@ -66,7 +66,7 @@
for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
// Skip visiting the sub blocks that are wholly within the UMV.
for (c = 0; c < max_blocks_wide; c += (1 << tx_size)) {
- visit(plane, i, plane_bsize, tx_size, arg);
+ visit(plane, i, r, c, plane_bsize, tx_size, arg);
i += step;
}
i += extra_step;
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 8454154..b89d791 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -283,6 +283,7 @@
}
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+ int blk_row, int blk_col,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size,
void *arg);
@@ -296,17 +297,6 @@
const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
foreach_transformed_block_visitor visit, void *arg);
-static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, int block,
- int *x, int *y) {
- const int bwl = b_width_log2_lookup[plane_bsize];
- const int tx_cols_log2 = bwl - tx_size;
- const int tx_cols = 1 << tx_cols_log2;
- const int raster_mb = block >> (tx_size << 1);
- *x = (raster_mb & (tx_cols - 1)) << tx_size;
- *y = (raster_mb >> tx_cols_log2) << tx_size;
-}
-
void vp10_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
int aoff, int loff);
diff --git a/vp10/common/entropy.c b/vp10/common/entropy.c
index 56dd73a..1676506 100644
--- a/vp10/common/entropy.c
+++ b/vp10/common/entropy.c
@@ -403,7 +403,6 @@
{255, 241, 243, 255, 236, 255, 252, 254},
{255, 243, 245, 255, 237, 255, 252, 254},
{255, 246, 247, 255, 239, 255, 253, 255},
- {255, 246, 247, 255, 239, 255, 253, 255},
};
static const vp10_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,14 +742,16 @@
};
static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
- memcpy(probs, vp10_pareto8_full[p = 0 ? 0 : p - 1],
- MODEL_NODES * sizeof(vpx_prob));
+ memcpy(probs, vp10_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
}
void vp10_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
if (full != model)
memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
- extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
+ // TODO(aconverse): model[PIVOT_NODE] should never be zero.
+ // https://code.google.com/p/webm/issues/detail?id=1089
+ if (model[PIVOT_NODE] != 0)
+ extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
void vp10_default_coef_probs(VP10_COMMON *cm) {
diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index fba7020..2f93cb3 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h
@@ -153,7 +153,7 @@
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
#define UNCONSTRAINED_NODES 3
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index 532e82c..6c056d2 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -30,13 +30,13 @@
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
- vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+ vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index ff23fee..92ba4dd 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -323,195 +323,6 @@
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblock_plane *const p = &x->plane[plane];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
- const scan_order *const scan_order = get_scan(tx_size, tx_type);
- tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint16_t *const eob = &p->eobs[block];
- const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- int i, j;
- const int16_t *src_diff;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- src_diff = &p->src_diff[4 * (j * diff_stride + i)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- switch (tx_size) {
- case TX_32X32:
- highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
- p->round_fp, p->quant_fp, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan,
- scan_order->iscan);
- break;
- case TX_16X16:
- vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
- vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- case TX_8X8:
- vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
- vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- case TX_4X4:
- if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
- vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
- }
- vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- default:
- assert(0);
- }
- return;
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- switch (tx_size) {
- case TX_32X32:
- fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
- scan_order->iscan);
- break;
- case TX_16X16:
- vpx_fdct16x16(src_diff, coeff, diff_stride);
- vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- case TX_8X8:
- vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
- x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- case TX_4X4:
- if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
- vp10_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- vpx_fdct4x4(src_diff, coeff, diff_stride);
- }
- vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob,
- scan_order->scan, scan_order->iscan);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblock_plane *const p = &x->plane[plane];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint16_t *const eob = &p->eobs[block];
- const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- int i, j;
- const int16_t *src_diff;
-
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- src_diff = &p->src_diff[4 * (j * diff_stride + i)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- switch (tx_size) {
- case TX_32X32:
- vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_16X16:
- vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_8X8:
- vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
- vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_4X4:
- if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
- vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
- }
- vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- default:
- assert(0);
- }
- return;
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- switch (tx_size) {
- case TX_32X32:
- vpx_fdct32x32_1(src_diff, coeff, diff_stride);
- vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_16X16:
- vpx_fdct16x16_1(src_diff, coeff, diff_stride);
- vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_8X8:
- vpx_fdct8x8_1(src_diff, coeff, diff_stride);
- vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- case TX_4X4:
- if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
- vp10_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- vpx_fdct4x4(src_diff, coeff, diff_stride);
- }
- vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
- p->quant_fp[0], qcoeff, dqcoeff,
- pd->dequant[0], eob);
- break;
- default:
- assert(0);
- break;
- }
-}
-
void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type, int lossless) {
if (lossless) {
@@ -657,8 +468,9 @@
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -670,10 +482,196 @@
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- int i, j;
const int16_t *src_diff;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_16X16:
+ vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+ vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+ vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+ } else {
+ vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+ }
+ vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_16X16:
+ vpx_fdct16x16(src_diff, coeff, diff_stride);
+ vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+ x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ vp10_fwht4x4(src_diff, coeff, diff_stride);
+ } else {
+ vpx_fdct4x4(src_diff, coeff, diff_stride);
+ }
+ vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_4X4:
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+ } else {
+ vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
+ }
+ vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ default:
+ assert(0);
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_4X4:
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ vp10_fwht4x4(src_diff, coeff, diff_stride);
+ } else {
+ vpx_fdct4x4(src_diff, coeff, diff_stride);
+ }
+ vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
+ p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
+
+
+
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
+ const scan_order *const scan_order = get_scan(tx_size, tx_type);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -751,7 +749,8 @@
}
}
-static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
@@ -760,14 +759,12 @@
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- int i, j;
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
- a = &ctx->ta[plane][i];
- l = &ctx->tl[plane][j];
+ dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+ a = &ctx->ta[plane][blk_col];
+ l = &ctx->tl[plane][blk_row];
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
@@ -786,17 +783,20 @@
*a = *l = 0;
return;
} else {
- vp10_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant_fp(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
}
} else {
if (max_txsize_lookup[plane_bsize] == tx_size) {
int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
// full forward transform and quantization
- vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
} else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
// fast path forward transform and quantization
- vp10_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
} else {
// skip forward transform
p->eobs[block] = 0;
@@ -804,7 +804,8 @@
return;
}
} else {
- vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
}
}
}
@@ -879,19 +880,18 @@
}
}
-static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
MACROBLOCK *const x = (MACROBLOCK *)arg;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- int i, j;
uint8_t *dst;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
+ dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
- vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
if (p->eobs[block] > 0) {
#if CONFIG_VP9_HIGHBITDEPTH
@@ -948,8 +948,9 @@
}
}
-void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -971,15 +972,13 @@
uint16_t *eob = &p->eobs[block];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- int i, j;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
- dst = &pd->dst.buf[4 * (j * dst_stride + i)];
- src = &p->src.buf[4 * (j * src_stride + i)];
- src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+ dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+ src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride,
- dst, dst_stride, i, j, plane);
+ dst, dst_stride, blk_col, blk_row, plane);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1130,5 +1129,5 @@
struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
- vp10_encode_block_intra, &arg);
+ vp10_encode_block_intra, &arg);
}
diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index 62a7db4..2e6516e 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h
@@ -26,16 +26,20 @@
void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-void vp10_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg);
+void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
index b6fd653..522e185 100644
--- a/vp10/encoder/palette.c
+++ b/vp10/encoder/palette.c
@@ -39,12 +39,20 @@
}
}
+// Generate a random number in the range [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+ *state = *state * 1103515245 + 12345;
+ return *state / 65536 % 32768;
+}
+
static void calc_centroids(const double *data, double *centroids,
const uint8_t *indices, int n, int k, int dim) {
int i, j, index;
int count[PALETTE_MAX_SIZE];
+ unsigned int rand_state = (unsigned int)data[0];
- srand((unsigned int) data[0]);
+ assert(n <= 32768);
+
memset(count, 0, sizeof(count[0]) * k);
memset(centroids, 0, sizeof(centroids[0]) * k * dim);
@@ -59,8 +67,7 @@
for (i = 0; i < k; ++i) {
if (count[i] == 0) {
- // TODO(huisu): replace rand() with something else.
- memcpy(centroids + i * dim, data + (rand() % n) * dim,
+ memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
sizeof(centroids[0]) * dim);
} else {
const double norm = 1.0 / count[i];
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 2e000af..cecc59c 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -451,18 +451,16 @@
*out_sse = this_sse >> shift;
}
-static int rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
+static int rate_block(int plane, int block, int blk_row, int blk_col,
TX_SIZE tx_size, struct rdcost_block_args* args) {
- int x_idx, y_idx;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
-
- return cost_coeffs(args->x, plane, block, args->t_above + x_idx,
- args->t_left + y_idx, tx_size,
+ return cost_coeffs(args->x, plane, block, args->t_above + blk_col,
+ args->t_left + blk_row, tx_size,
args->so->scan, args->so->neighbors,
args->use_fast_coef_costing);
}
-static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
@@ -478,20 +476,23 @@
if (!is_inter_block(mbmi)) {
struct encode_b_args arg = {x, NULL, &mbmi->skip};
- vp10_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
+ vp10_encode_block_intra(plane, block, blk_row, blk_col,
+ plane_bsize, tx_size, &arg);
dist_block(x, plane, block, tx_size, &dist, &sse);
} else if (max_txsize_lookup[plane_bsize] == tx_size) {
if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_NONE) {
// full forward transform and quantization
- vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
SKIP_TXFM_AC_ONLY) {
// compute DC coefficient
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
- vp10_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
+ plane_bsize, tx_size);
sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
dist = sse;
if (x->plane[plane].eobs[block]) {
@@ -515,7 +516,7 @@
}
} else {
// full forward transform and quantization
- vp10_xform_quant(x, plane, block, plane_bsize, tx_size);
+ vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
dist_block(x, plane, block, tx_size, &dist, &sse);
}
@@ -525,7 +526,7 @@
return;
}
- rate = rate_block(plane, block, plane_bsize, tx_size, args);
+ rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 2c9998b..e568c0b 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -443,7 +443,9 @@
TOKENEXTRA **tp;
};
-static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void set_entropy_context_b(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
ThreadData *const td = args->td;
@@ -451,10 +453,8 @@
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
- int aoff, loff;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
vp10_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0,
- aoff, loff);
+ blk_col, blk_row);
}
static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
@@ -520,7 +520,8 @@
}
}
-static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
+static void tokenize_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
VP10_COMP *cpi = args->cpi;
@@ -553,11 +554,8 @@
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int16_t token;
EXTRABIT extra;
- int aoff, loff;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
-
- pt = get_entropy_context(tx_size, pd->above_context + aoff,
- pd->left_context + loff);
+ pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+ pd->left_context + blk_row);
scan = so->scan;
nb = so->neighbors;
c = 0;
@@ -597,20 +595,22 @@
*tp = t;
- vp10_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff);
+ vp10_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
}
struct is_skippable_args {
uint16_t *eobs;
int *skippable;
};
-static void is_skippable(int plane, int block,
+static void is_skippable(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *argv) {
struct is_skippable_args *args = argv;
(void)plane;
(void)plane_bsize;
(void)tx_size;
+ (void)blk_row;
+ (void)blk_col;
args->skippable[0] &= (!args->eobs[block]);
}
@@ -624,13 +624,15 @@
return result;
}
-static void has_high_freq_coeff(int plane, int block,
+static void has_high_freq_coeff(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *argv) {
struct is_skippable_args *args = argv;
int eobs = (tx_size == TX_4X4) ? 3 : 10;
(void) plane;
(void) plane_bsize;
+ (void) blk_row;
+ (void) blk_col;
*(args->skippable) |= (args->eobs[block] > eobs);
}
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 579857b..719e542 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -403,7 +403,6 @@
{255, 241, 243, 255, 236, 255, 252, 254},
{255, 243, 245, 255, 237, 255, 252, 254},
{255, 246, 247, 255, 239, 255, 253, 255},
- {255, 246, 247, 255, 239, 255, 253, 255},
};
static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
@@ -743,14 +742,16 @@
};
static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
- memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
- MODEL_NODES * sizeof(vpx_prob));
+ memcpy(probs, vp9_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
}
void vp9_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
if (full != model)
memcpy(full, model, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
- extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
+ // TODO(aconverse): model[PIVOT_NODE] should never be zero.
+ // https://code.google.com/p/webm/issues/detail?id=1089
+ if (model[PIVOT_NODE] != 0)
+ extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
void vp9_default_coef_probs(VP9_COMMON *cm) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 21611ed..63b3bff 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -138,7 +138,7 @@
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
-#define COEFF_PROB_MODELS 256
+#define COEFF_PROB_MODELS 255
#define UNCONSTRAINED_NODES 3
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ed5f4ca..5bf71ef 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -248,7 +248,7 @@
specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";
add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+ specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 2cd89c0..bdc95d4 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -486,10 +486,17 @@
// Account for larger interval on base layer for temporal layers.
if (cr->percent_refresh > 0 &&
rc->frames_since_key < (4 * cpi->svc.number_temporal_layers) *
- (100 / cr->percent_refresh))
+ (100 / cr->percent_refresh)) {
cr->rate_ratio_qdelta = 3.0;
- else
+ } else {
cr->rate_ratio_qdelta = 2.0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->denoiser.denoising_level >= kMedium)
+ // Reduce the delta-qp if the estimated source noise is above threshold.
+ cr->rate_ratio_qdelta = 1.5;
+#endif
+ }
// Adjust some parameters for low resolutions at low bitrates.
if (cm->width <= 352 &&
cm->height <= 288 &&
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index e87cccb..396ed3f 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+ vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
- vpx_memalign(16, num_blk * sizeof(*ctx->eobs[i][k])));
+ vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 678e312..05aa1d2 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -10,6 +10,8 @@
#include <assert.h>
#include <limits.h>
+#include <math.h>
+
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_scale/yv12config.h"
@@ -17,13 +19,9 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
+#include "vp9/encoder/vp9_encoder.h"
-/* The VP9 denoiser is a work-in-progress. It currently is only designed to work
- * with speed 6, though it (inexplicably) seems to also work with speed 5 (one
- * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to
- * make the calls to the vp9_denoiser_* functions when in speed 5).
- *
- * The implementation is very similar to that of the VP8 denoiser. While
+/* The VP9 denoiser is similar to that of the VP8 denoiser. While
* choosing the motion vectors / reference frames, the denoiser is run, and if
* it did not modify the signal to much, the denoised block is copied to the
* signal.
@@ -195,7 +193,7 @@
int mi_row,
int mi_col,
PICK_MODE_CONTEXT *ctx,
- int *motion_magnitude,
+ int motion_magnitude,
int is_skin) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -209,18 +207,17 @@
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
- *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
frame = ctx->best_reference_frame;
saved_mbmi = *mbmi;
- if (is_skin && *motion_magnitude > 16)
+ if (is_skin && motion_magnitude > 16)
return COPY_BLOCK;
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME &&
- sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
+ sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
mbmi->ref_frame[0] = ctx->best_reference_frame;
mbmi->mode = ctx->best_sse_inter_mode;
mbmi->mv[0] = ctx->best_sse_mv;
@@ -242,7 +239,7 @@
*mbmi = saved_mbmi;
return COPY_BLOCK;
}
- if (*motion_magnitude >
+ if (motion_magnitude >
(noise_motion_thresh(bs, increase_denoising) << 3)) {
// Restore everything to its original state
*mbmi = saved_mbmi;
@@ -315,6 +312,7 @@
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs,
PICK_MODE_CONTEXT *ctx) {
+ int mv_col, mv_row;
int motion_magnitude = 0;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
@@ -325,7 +323,7 @@
struct buf_2d src = mb->plane[0].src;
int is_skin = 0;
- if (bs <= BLOCK_16X16) {
+ if (bs <= BLOCK_16X16 && denoiser->denoising_level >= kMedium) {
// Take center pixel in block to determine is_skin.
const int y_width_shift = (4 << b_width_log2_lookup[bs]) >> 1;
const int y_height_shift = (4 << b_height_log2_lookup[bs]) >> 1;
@@ -342,17 +340,28 @@
is_skin = vp9_skin_pixel(ysource, usource, vsource);
}
- decision = perform_motion_compensation(denoiser, mb, bs,
- denoiser->increase_denoising,
- mi_row, mi_col, ctx,
- &motion_magnitude,
- is_skin);
+ mv_col = ctx->best_sse_mv.as_mv.col;
+ mv_row = ctx->best_sse_mv.as_mv.row;
+ motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+ if (denoiser->denoising_level == kHigh && motion_magnitude < 16) {
+ denoiser->increase_denoising = 1;
+ } else {
+ denoiser->increase_denoising = 0;
+ }
+
+ if (denoiser->denoising_level >= kMedium)
+ decision = perform_motion_compensation(denoiser, mb, bs,
+ denoiser->increase_denoising,
+ mi_row, mi_col, ctx,
+ motion_magnitude,
+ is_skin);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride,
mc_avg_start, mc_avg.y_stride,
avg_start, avg.y_stride,
- 0, bs, motion_magnitude);
+ denoiser->increase_denoising,
+ bs, motion_magnitude);
}
if (decision == FILTER_BLOCK) {
@@ -499,14 +508,43 @@
vp9_denoiser_free(denoiser);
return 1;
}
+
+ fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height,
+ ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, legacy_byte_alignment);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
#ifdef OUTPUT_YUV_DENOISED
make_grayscale(&denoiser->running_avg_y[i]);
#endif
denoiser->increase_denoising = 0;
denoiser->frame_buffer_initialized = 1;
+ vp9_denoiser_init_noise_estimate(denoiser, width, height);
return 0;
}
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+ int width,
+ int height) {
+ // Denoiser is off by default, i.e., no denoising is performed.
+ // Noise level is measured periodically, and if observed to be above
+ // thresh_noise_estimate, then denoising is performed.
+ denoiser->denoising_level = kLow;
+ denoiser->noise_estimate = 0;
+ denoiser->noise_estimate_count = 0;
+ denoiser->thresh_noise_estimate = 20;
+ if (width * height >= 1920 * 1080) {
+ denoiser->thresh_noise_estimate = 70;
+ } else if (width * height >= 1280 * 720) {
+ denoiser->thresh_noise_estimate = 40;
+ }
+}
+
void vp9_denoiser_free(VP9_DENOISER *denoiser) {
int i;
denoiser->frame_buffer_initialized = 0;
@@ -517,6 +555,125 @@
vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
}
vpx_free_frame_buffer(&denoiser->mc_running_avg_y);
+ vpx_free_frame_buffer(&denoiser->last_source);
+}
+
+void vp9_denoiser_update_noise_estimate(VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int frame_period = 10;
+ int thresh_consec_zeromv = 8;
+ unsigned int thresh_sum_diff = 128;
+ int num_frames_estimate = 20;
+ int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7;
+ // Estimate of noise level every frame_period frames.
+ // Estimate is between current source and last source.
+ if (cm->current_video_frame % frame_period != 0 ||
+ cpi->denoiser.last_source.y_buffer == NULL) {
+ copy_frame(&cpi->denoiser.last_source, cpi->Source);
+ return;
+ } else {
+ int num_samples = 0;
+ uint64_t avg_est = 0;
+ int bsize = BLOCK_16X16;
+ static const unsigned char const_source[16] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128};
+ // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have
+ // been encoded as zero/small mv at least x consecutive frames, compute
+ // the variance to update estimate of noise in the source.
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const uint8_t *last_src_y = cpi->denoiser.last_source.y_buffer;
+ const int last_src_ystride = cpi->denoiser.last_source.y_stride;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_uvstride = cpi->Source->uv_stride;
+ const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+ const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+ const int uv_width_shift = y_width_shift >> 1;
+ const int uv_height_shift = y_height_shift >> 1;
+ int mi_row, mi_col;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row ++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col ++) {
+ // 16x16 blocks, 1/4 sample of frame.
+ if (mi_row % 4 == 0 && mi_col % 4 == 0) {
+ int bl_index = mi_row * cm->mi_cols + mi_col;
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + cm->mi_cols;
+ int bl_index3 = bl_index2 + 1;
+ // Only consider blocks that are likely steady background. i.e, have
+ // been encoded as zero/low motion x (= thresh_consec_zeromv) frames
+ // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all
+ // 4 sub-blocks for 16x16 block. Also, avoid skin blocks.
+ const uint8_t ysource =
+ src_y[y_height_shift * src_ystride + y_width_shift];
+ const uint8_t usource =
+ src_u[uv_height_shift * src_uvstride + uv_width_shift];
+ const uint8_t vsource =
+ src_v[uv_height_shift * src_uvstride + uv_width_shift];
+ int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+ if (cr->consec_zero_mv[bl_index] > thresh_consec_zeromv &&
+ cr->consec_zero_mv[bl_index1] > thresh_consec_zeromv &&
+ cr->consec_zero_mv[bl_index2] > thresh_consec_zeromv &&
+ cr->consec_zero_mv[bl_index3] > thresh_consec_zeromv &&
+ !is_skin) {
+ // Compute variance.
+ unsigned int sse;
+ unsigned int variance = cpi->fn_ptr[bsize].vf(src_y,
+ src_ystride,
+ last_src_y,
+ last_src_ystride,
+ &sse);
+ // Only consider this block as valid for noise measurement if the
+ // average term (sse - variance = N * avg^{2}, N = 16X16) of the
+ // temporal residual is small (avoid effects from lighting change).
+ if ((sse - variance) < thresh_sum_diff) {
+ unsigned int sse2;
+ const unsigned int spatial_variance =
+ cpi->fn_ptr[bsize].vf(src_y, src_ystride, const_source,
+ 0, &sse2);
+ avg_est += variance / (10 + spatial_variance);
+ num_samples++;
+ }
+ }
+ }
+ src_y += 8;
+ last_src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ }
+ src_y += (src_ystride << 3) - (cm->mi_cols << 3);
+ last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+ src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+ src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+ }
+ // Update noise estimate if we have at a minimum number of block samples,
+ // and avg_est > 0 (avg_est == 0 can happen if the application inputs
+ // duplicate frames).
+ if (num_samples > min_blocks_estimate && avg_est > 0) {
+ // Normalize.
+ avg_est = (avg_est << 8) / num_samples;
+ // Update noise estimate.
+ cpi->denoiser.noise_estimate = (3 * cpi->denoiser.noise_estimate +
+ avg_est) >> 2;
+ cpi->denoiser.noise_estimate_count++;
+ if (cpi->denoiser.noise_estimate_count == num_frames_estimate) {
+ // Reset counter and check noise level condition.
+ cpi->denoiser.noise_estimate_count = 0;
+ if (cpi->denoiser.noise_estimate >
+ (cpi->denoiser.thresh_noise_estimate << 1))
+ cpi->denoiser.denoising_level = kHigh;
+ else
+ if (cpi->denoiser.noise_estimate >
+ cpi->denoiser.thresh_noise_estimate)
+ cpi->denoiser.denoising_level = kMedium;
+ else
+ cpi->denoiser.denoising_level = kLow;
+ }
+ }
+ }
+ copy_frame(&cpi->denoiser.last_source, cpi->Source);
}
#ifdef OUTPUT_YUV_DENOISED
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index ec0b25e..c149e9d 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -26,13 +26,26 @@
FILTER_BLOCK
} VP9_DENOISER_DECISION;
+typedef enum vp9_denoiser_level {
+ kLow,
+ kMedium,
+ kHigh
+} VP9_DENOISER_LEVEL;
+
typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
+ YV12_BUFFER_CONFIG last_source;
int increase_denoising;
int frame_buffer_initialized;
+ VP9_DENOISER_LEVEL denoising_level;
+ int noise_estimate;
+ int thresh_noise_estimate;
+ int noise_estimate_count;
} VP9_DENOISER;
+struct VP9_COMP;
+
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
@@ -69,6 +82,12 @@
void vp9_denoiser_free(VP9_DENOISER *denoiser);
+void vp9_denoiser_init_noise_estimate(VP9_DENOISER *denoiser,
+ int width,
+ int height);
+
+void vp9_denoiser_update_noise_estimate(struct VP9_COMP *const cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2333a13..1f5709c 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -481,7 +481,7 @@
VP9_COMMON *const cm = &cpi->common;
const int is_key_frame = (cm->frame_type == KEY_FRAME);
const int threshold_multiplier = is_key_frame ? 20 : 1;
- const int64_t threshold_base = (int64_t)(threshold_multiplier *
+ int64_t threshold_base = (int64_t)(threshold_multiplier *
cpi->y_dequant[q][1]);
if (is_key_frame) {
thresholds[0] = threshold_base;
@@ -489,6 +489,16 @@
thresholds[2] = threshold_base >> 2;
thresholds[3] = threshold_base << 2;
} else {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // Increase base variance threshold if estimated noise level is high.
+ if (cpi->denoiser.denoising_level == kHigh)
+ threshold_base = threshold_base << 2;
+ else
+ if (cpi->denoiser.denoising_level == kMedium)
+ threshold_base = threshold_base << 1;
+ }
+#endif
thresholds[1] = threshold_base;
if (cm->width <= 352 && cm->height <= 288) {
thresholds[0] = threshold_base >> 2;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 5b75d67..72eafec 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3250,6 +3250,13 @@
&cpi->scaled_last_source,
(cpi->oxcf.pass == 0));
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ vp9_denoiser_update_noise_estimate(cpi);
+ }
+#endif
+
if (cpi->oxcf.pass == 0 &&
cpi->oxcf.rc_mode == VPX_CBR &&
cpi->resize_state == 0 &&
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a6b5ebb..30738b5 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -1183,10 +1183,13 @@
double group_weight_factor) {
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ // Clamp the target rate to VBR min / max limits.
+ const int target_rate =
+ vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
- if (section_target_bandwidth <= 0) {
+ if (target_rate <= 0) {
return rc->worst_quality; // Highest value allowed
} else {
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
@@ -1195,7 +1198,7 @@
const double av_err_per_mb = section_err / active_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
- const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth <<
+ const int target_norm_bits_per_mb = ((uint64_t)target_rate <<
BPER_MB_NORMBITS) / active_mbs;
int q;
@@ -2444,7 +2447,7 @@
if ((i <= rc->max_gf_interval) ||
((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
const double frame_boost =
- calc_frame_boost(cpi, this_frame, 0, KF_MAX_BOOST);
+ calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST);
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
@@ -2737,11 +2740,6 @@
}
target_rate = gf_group->bit_allocation[gf_group->index];
- if (cpi->common.frame_type == KEY_FRAME)
- target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
- else
- target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
-
rc->base_frame_target = target_rate;
{
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index fc4d9ae..af04583 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1068,6 +1068,21 @@
{GOLDEN_FRAME, NEWMV}
};
+int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
+ // Reduce the intra cost penalty for small blocks (<=16x16).
+ int reduction_fac =
+ (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->denoiser.denoising_level == kHigh)
+ // Don't reduce intra cost penalty if estimated noise level is high.
+ reduction_fac = 0;
+#endif
+ return vp9_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
+}
+
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// this needs various further optimizations. to be continued..
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -1094,11 +1109,7 @@
// var_y and sse_y are saved to be used in skipping checking
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
- // Reduce the intra cost penalty for small blocks (<=16x16).
- const int reduction_fac = (bsize <= BLOCK_16X16) ?
- ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
- const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
+ const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize);
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1d13199..d700685 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1816,6 +1816,11 @@
RATE_CONTROL *const rc = &cpi->rc;
int target_rate = rc->base_frame_target;
+ if (cpi->common.frame_type == KEY_FRAME)
+ target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
+ else
+ target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+
// Correction to rate target based on prior over or under shoot.
if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
vbr_rate_correction(cpi, &target_rate);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1944291..4f3a06e 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -296,30 +296,11 @@
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz) {
- int i;
- int32_t c, d;
- int64_t error = 0, sqcoeff = 0;
- int16_t diff;
-
- const int32_t hi = 0x00007fff;
- const int32_t lo = 0xffff8000;
-
- for (i = 0; i < block_size; i++) {
- c = coeff[i];
- d = dqcoeff[i];
-
- // Saturate to 16 bits
- c = (c > hi) ? hi : ((c < lo) ? lo : c);
- d = (d > hi) ? hi : ((d < lo) ? lo : d);
-
- diff = d - c;
- error += diff * diff;
- sqcoeff += c * c;
- }
- assert(error >= 0 && sqcoeff >= 0);
-
- *ssz = sqcoeff;
- return error;
+ // Note that the C versions of these 2 functions (vp9_block_error and
+ // vp9_highbd_block_error_8bit are the same, but the optimized assembly
+ // routines are not compatible in the non high bitdepth configuration, so
+ // they still cannot share the same name.
+ return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
}
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
new file mode 100644
index 0000000..e476323
--- /dev/null
+++ b/vp9/encoder/x86/vp9_highbd_error_avx.asm
@@ -0,0 +1,261 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+ALIGN 16
+
+;
+; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
+; intptr_t block_size, int64_t *ssz)
+;
+
+INIT_XMM avx
+cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
+ vzeroupper
+
+ ; If only one iteration is required, then handle this as a special case.
+ ; It is the most frequent case, so we can have a significant gain here
+ ; by not setting up a loop and accumulators.
+ cmp sizeq, 16
+ jne .generic
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Common case of size == 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; Load input vectors
+ mova xm0, [dqcq]
+ packssdw xm0, [dqcq+16]
+ mova xm2, [uqcq]
+ packssdw xm2, [uqcq+16]
+
+ mova xm1, [dqcq+32]
+ packssdw xm1, [dqcq+48]
+ mova xm3, [uqcq+32]
+ packssdw xm3, [uqcq+48]
+
+ ; Compute the errors.
+ psubw xm0, xm2
+ psubw xm1, xm3
+
+ ; Individual errors are max 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+ pmaddwd xm2, xm2
+ pmaddwd xm3, xm3
+
+ pmaddwd xm0, xm0
+ pmaddwd xm1, xm1
+
+ ; Squares are always positive, so we can use unsigned arithmetic after
+ ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+ ; fit in 32bits
+ paddd xm2, xm3
+ paddd xm0, xm1
+
+ ; Accumulate horizontally in 64 bits, there is no chance of overflow here
+ pxor xm5, xm5
+
+ pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
+ psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
+
+ pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
+ psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
+
+ paddq xm2, xm3
+ paddq xm0, xm1
+
+ psrldq xm3, xm2, 8
+ psrldq xm1, xm0, 8
+
+ paddq xm2, xm3
+ paddq xm0, xm1
+
+ ; Store the return value
+%if ARCH_X86_64
+ movq rax, xm0
+ movq [sszq], xm2
+%else
+ movd eax, xm0
+ pextrd edx, xm0, 1
+ movq [sszd], xm2
+%endif
+ RET
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Generic case of size != 16, speculative low precision
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ALIGN 16
+.generic:
+ pxor xm4, xm4 ; sse accumulator
+ pxor xm5, xm5 ; overflow detection register for xm4
+ pxor xm6, xm6 ; ssz accumulator
+ pxor xm7, xm7 ; overflow detection register for xm6
+ lea uqcq, [uqcq+sizeq*4]
+ lea dqcq, [dqcq+sizeq*4]
+ neg sizeq
+
+ ; Push the negative size as the high precision code might need it
+ push sizeq
+
+.loop:
+ ; Load input vectors
+ mova xm0, [dqcq+sizeq*4]
+ packssdw xm0, [dqcq+sizeq*4+16]
+ mova xm2, [uqcq+sizeq*4]
+ packssdw xm2, [uqcq+sizeq*4+16]
+
+ mova xm1, [dqcq+sizeq*4+32]
+ packssdw xm1, [dqcq+sizeq*4+48]
+ mova xm3, [uqcq+sizeq*4+32]
+ packssdw xm3, [uqcq+sizeq*4+48]
+
+ add sizeq, 16
+
+ ; Compute the squared errors.
+ ; Individual errors are max 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
+ psubw xm0, xm2
+ pmaddwd xm2, xm2
+ pmaddwd xm0, xm0
+
+ psubw xm1, xm3
+ pmaddwd xm3, xm3
+ pmaddwd xm1, xm1
+
+ ; Squares are always positive, so we can use unsigned arithmetic after
+ ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
+ ; fit in 32bits
+ paddd xm2, xm3
+ paddd xm0, xm1
+
+ ; We accumulate using 32 bit arithmetic, but detect potential overflow
+ ; by checking if the MSB of the accumulators have ever been a set bit.
+ ; If yes, we redo the whole compute at the end on higher precision, but
+ ; this happens extremely rarely, so we still achieve a net gain.
+ paddd xm4, xm0
+ paddd xm6, xm2
+ por xm5, xm4 ; OR in the accumulator for overflow detection
+ por xm7, xm6 ; OR in the accumulator for overflow detection
+
+ jnz .loop
+
+ ; Add pairs horizontally (still only on 32 bits)
+ phaddd xm4, xm4
+ por xm5, xm4 ; OR in the accumulator for overflow detection
+ phaddd xm6, xm6
+ por xm7, xm6 ; OR in the accumulator for overflow detection
+
+ ; Check for possibility of overflow by testing if bit 32 of each dword lane
+ ; have ever been set. If they were not, then there was no overflow and the
+ ; final sum will fit in 32 bits. If overflow happened, then
+ ; we redo the whole computation on higher precision.
+ por xm7, xm5
+ pmovmskb r4, xm7
+ test r4, 0x8888
+ jnz .highprec
+
+ phaddd xm4, xm4
+ phaddd xm6, xm6
+ pmovzxdq xm4, xm4
+ pmovzxdq xm6, xm6
+
+ ; Restore stack
+ pop sizeq
+
+ ; Store the return value
+%if ARCH_X86_64
+ movq rax, xm4
+ movq [sszq], xm6
+%else
+ movd eax, xm4
+ pextrd edx, xm4, 1
+ movq [sszd], xm6
+%endif
+ RET
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Generic case of size != 16, high precision case
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.highprec:
+ pxor xm4, xm4 ; sse accumulator
+ pxor xm5, xm5 ; dedicated zero register
+ pxor xm6, xm6 ; ssz accumulator
+ pop sizeq
+
+.loophp:
+ mova xm0, [dqcq+sizeq*4]
+ packssdw xm0, [dqcq+sizeq*4+16]
+ mova xm2, [uqcq+sizeq*4]
+ packssdw xm2, [uqcq+sizeq*4+16]
+
+ mova xm1, [dqcq+sizeq*4+32]
+ packssdw xm1, [dqcq+sizeq*4+48]
+ mova xm3, [uqcq+sizeq*4+32]
+ packssdw xm3, [uqcq+sizeq*4+48]
+
+ add sizeq, 16
+
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+
+ psubw xm0, xm2
+ pmaddwd xm2, xm2
+ pmaddwd xm0, xm0
+
+ psubw xm1, xm3
+ pmaddwd xm3, xm3
+ pmaddwd xm1, xm1
+
+ ; accumulate in 64bit
+ punpckldq xm7, xm0, xm5
+ punpckhdq xm0, xm5
+ paddq xm4, xm7
+
+ punpckldq xm7, xm2, xm5
+ punpckhdq xm2, xm5
+ paddq xm6, xm7
+
+ punpckldq xm7, xm1, xm5
+ punpckhdq xm1, xm5
+ paddq xm4, xm7
+
+ punpckldq xm7, xm3, xm5
+ punpckhdq xm3, xm5
+ paddq xm6, xm7
+
+ paddq xm4, xm0
+ paddq xm4, xm1
+ paddq xm6, xm2
+ paddq xm6, xm3
+
+ jnz .loophp
+
+ ; Accumulate horizontally
+ movhlps xm5, xm4
+ movhlps xm7, xm6
+ paddq xm4, xm5
+ paddq xm6, xm7
+
+ ; Store the return value
+%if ARCH_X86_64
+ movq rax, xm4
+ movq [sszq], xm6
+%else
+ movd eax, xm4
+ pextrd edx, xm4, 1
+ movq [sszd], xm6
+%endif
+ RET
+
+END
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a2cbacf..25a176f 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,6 +102,7 @@
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
else
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 31d8c75..9620eaa 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -248,7 +248,8 @@
endif
ifeq ($(ARCH_X86_64),yes)
ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm
+DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm
endif
endif
endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 0117f17..b369b05 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -913,25 +913,19 @@
# Quantization
#
if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
+ specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
+ specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc", "$avx_x86_64_x86inc";
- add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b sse2/;
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vpx_highbd_quantize_b sse2/;
- add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-} else {
- add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
-
- add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
-} # CONFIG_VP9_HIGHBITDEPTH
+ add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+ } # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
if (vpx_config("CONFIG_ENCODERS") eq "yes") {
diff --git a/vpx_dsp/x86/quantize_avx_x86_64.asm b/vpx_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 0000000..01c4129
--- /dev/null
+++ b/vpx_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,544 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ vzeroupper
+
+ ; If we can skip this block, then just zero the output
+ cmp skipmp, 0
+ jne .blank
+
+%ifnidn %1, b_32x32
+
+ ; Special case for ncoeff == 16, as it is frequent and we can save on
+ ; not setting up a loop.
+ cmp ncoeffmp, 16
+ jne .generic
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Special case of ncoeff == 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+ movifnidn coeffq, coeffmp
+ movifnidn zbinq, zbinmp
+ mova m0, [zbinq] ; m0 = zbin
+
+ ; Get DC and first 15 AC coeffs - in this special case, that is all.
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+ mova m9, [coeffq]
+ packssdw m9, [coeffq+16] ; m9 = c[i]
+ mova m10, [coeffq+32]
+ packssdw m10, [coeffq+48] ; m10 = c[i]
+%else
+ mova m9, [coeffq] ; m9 = c[i]
+ mova m10, [coeffq+16] ; m10 = c[i]
+%endif
+
+ mov r0, eobmp ; Output pointer
+ mov r1, qcoeffmp ; Output pointer
+ mov r2, dqcoeffmp ; Output pointer
+
+ pxor m5, m5 ; m5 = dedicated zero
+
+ pcmpeqw m4, m4 ; All word lanes -1
+ paddw m0, m4 ; m0 = zbin - 1
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, we just write zeros
+ ; to the outputs and we are done.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .single_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [r1 ], ymm5
+ mova [r1+32], ymm5
+ mova [r2 ], ymm5
+ mova [r2+32], ymm5
+%else
+ mova [r1], ymm5
+ mova [r2], ymm5
+%endif
+ mov [r0], word 0
+
+ vzeroupper
+ RET
+
+.single_nonzero:
+
+ ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+ movifnidn r4, roundmp
+ movifnidn r5, quantmp
+ mov r3, dequantmp
+ mov r6, shiftmp
+ mova m1, [r4] ; m1 = round
+ mova m2, [r5] ; m2 = quant
+ mova m3, [r3] ; m3 = dequant
+ mova m4, [r6] ; m4 = shift
+
+ mov r3, iscanmp
+
+ DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq ], m11
+ mova [qcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+32], m11
+ mova [qcoeffq+48], m6
+%else
+ mova [qcoeffq ], m8
+ mova [qcoeffq+16], m13
+%endif
+
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq ], m11
+ mova [dqcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+32], m11
+ mova [dqcoeffq+48], m6
+%else
+ mova [dqcoeffq ], m8
+ mova [dqcoeffq+16], m13
+%endif
+
+ mova m6, [iscanq] ; m6 = scan[i]
+ mova m11, [iscanq+16] ; m11 = scan[i]
+
+ pcmpeqw m8, m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m13, m5 ; m13 = c[i] == 0
+ psubw m6, m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m8, m6 ; m8 = max(eob)
+ pandn m13, m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m8, m13
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [eobq], ax
+
+ vzeroupper
+ RET
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Generic case of ncoeff != 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+ qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+ ; Actual quantization loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+ mova m3, [r2] ; m3 = dequant
+ pcmpeqw m4, m4 ; All lanes -1
+%ifidn %1, b_32x32
+ psubw m0, m4
+ psubw m1, m4
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ paddw m0, m4 ; m0 = zbin - 1
+
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+%else
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+%else
+ mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .first_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4 ], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4 ], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+ mova [qcoeffq+ncoeffq*2], ymm5
+ mova [dqcoeffq+ncoeffq*2], ymm5
+%endif
+
+ add ncoeffq, mmsize
+
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ punpckhqdq m4, m4
+ pxor m8, m8
+
+ jmp .ac_only_loop
+
+.first_nonzero:
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m8
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m8
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+
+.ac_only_loop:
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+%else
+ mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, skip this iteration.
+ ; And just write zeros as the result would be.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .rest_nonzero
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4+ 0], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4+ 0], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], ymm5
+ mova [dqcoeffq+ncoeffq*2+ 0], ymm5
+%endif
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+
+.rest_nonzero:
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+%else
+ mova [qcoeffq+ncoeffq*2+ 0], m14
+ mova [qcoeffq+ncoeffq*2+16], m13
+%endif
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], m14
+ mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+
+ ; Skip-block, i.e. just write all zeroes
+.blank:
+
+DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
+ qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
+ lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
+
+ neg ncoeffq
+ pxor m7, m7
+
+.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [dqcoeffq+ncoeffq*4+ 0], ymm7
+ mova [dqcoeffq+ncoeffq*4+32], ymm7
+ mova [qcoeffq+ncoeffq*4+ 0], ymm7
+ mova [qcoeffq+ncoeffq*4+32], ymm7
+%else
+ mova [dqcoeffq+ncoeffq*2+ 0], ymm7
+ mova [qcoeffq+ncoeffq*2+ 0], ymm7
+%endif
+ add ncoeffq, mmsize
+ jl .blank_loop
+
+ mov [eobq], word 0
+
+ vzeroupper
+ RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 7
+QUANTIZE_FN b_32x32, 7
+
+END