Merge "ANS: Remove extra buffer size checks causing a false decode error." into nextgenv2
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index a3f0152..48edf1e 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -15,12 +15,16 @@
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/util.h"
#if CONFIG_VP10
#include "vp10/common/blockd.h"
#elif CONFIG_VP9
#include "vp9/common/vp9_blockd.h"
#endif
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define USE_SPEED_TEST (0)
typedef void (*SubtractFunc)(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
@@ -108,4 +112,151 @@
INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_msa));
#endif
+
+typedef void (*HBDSubtractFunc)(int rows, int cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride,
+ int bd);
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {  // Fixture parameterized on (width, height, bit depth, subtract function).
+ public:
+ virtual void SetUp() {  // Reads the test parameters, seeds the RNG and allocates the three buffers.
+ block_width_ = GET_PARAM(0);
+ block_height_ = GET_PARAM(1);
+ bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(2));
+ func_ = GET_PARAM(3);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ const size_t max_width = 128;
+ const size_t max_block_size = max_width * max_width;  // Buffers always hold max_width * max_width samples, whatever block size is under test.
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(  // uint16_t storage, kept as a uint8_t * via CONVERT_TO_BYTEPTR.
+ vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+ diff_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, max_block_size * sizeof(int16_t)));  // NOTE(review): none of the three allocations is checked for NULL.
+ }
+
+ virtual void TearDown() {  // Releases the buffers allocated in SetUp().
+ vpx_free(CONVERT_TO_SHORTPTR(src_));  // Presumably CONVERT_TO_SHORTPTR undoes CONVERT_TO_BYTEPTR, yielding the pointer vpx_memalign returned - confirm.
+ vpx_free(CONVERT_TO_SHORTPTR(pred_));
+ vpx_free(diff_);
+ }
+
+ protected:
+ void RunForSpeed();
+ void CheckResult();
+
+ private:
+ ACMRandom rnd_;  // Source of the random sample values.
+ int block_height_;  // Test parameter 1; passed to func_ as rows.
+ int block_width_;  // Test parameter 0; passed to func_ as cols and as every stride.
+ vpx_bit_depth_t bit_depth_;  // Test parameter 2; also determines the sample mask.
+ HBDSubtractFunc func_;  // Test parameter 3; the implementation under test.
+ uint8_t *src_;  // Source samples (uint16_t data behind a byte pointer).
+ uint8_t *pred_;  // Prediction samples (uint16_t data behind a byte pointer).
+ int16_t *diff_;  // Output residual written by func_.
+};
+
+void VP10HBDSubtractBlockTest::RunForSpeed() {  // Fills the buffers once with random samples, then calls func_ test_num times without checking the output.
+ const int test_num = 200000;
+ const int max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;  // Keeps only the low bit_depth_ bits of each random value.
+ int i, j;
+
+ for (j = 0; j < max_block_size; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ for (i = 0; i < test_num; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_,  // rows = block_height_, cols = block_width_; every stride equals block_width_.
+ src_, block_width_, pred_, block_width_, bit_depth_);
+ }
+}
+
+void VP10HBDSubtractBlockTest::CheckResult() {  // Runs func_ on test_num random inputs and expects diff == src - pred for every sample of the block.
+ const int test_num = 100;
+ const int max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;  // Keeps only the low bit_depth_ bits of each random value.
+ int i, j;
+
+ for (i = 0; i < test_num; ++i) {
+ for (j = 0; j < max_block_size; ++j) {  // Both buffers are refilled completely on every iteration.
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ func_(block_height_, block_width_, diff_, block_width_,  // rows = block_height_, cols = block_width_; every stride equals block_width_.
+ src_, block_width_, pred_, block_width_, bit_depth_);
+
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],  // Indexing uses the same block_width_ stride that was passed to func_.
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+}
+
+TEST_P(VP10HBDSubtractBlockTest, CheckResult) {
+ CheckResult();
+}
+
+#if USE_SPEED_TEST
+TEST_P(VP10HBDSubtractBlockTest, CheckSpeed) {
+ RunForSpeed();
+}
+#endif // USE_SPEED_TEST
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP10HBDSubtractBlockTest, ::testing::Values(  // Each block size runs at bit depth 12 through both the SSE2 and the C function; the C cases also sit under the SSE2 prefix.
+ make_tuple(4, 4, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(4, 4, 12, vpx_highbd_subtract_block_c),
+ make_tuple(4, 8, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(4, 8, 12, vpx_highbd_subtract_block_c),
+ make_tuple(8, 4, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(8, 4, 12, vpx_highbd_subtract_block_c),
+ make_tuple(8, 8, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(8, 8, 12, vpx_highbd_subtract_block_c),
+ make_tuple(8, 16, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(8, 16, 12, vpx_highbd_subtract_block_c),
+ make_tuple(16, 8, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(16, 8, 12, vpx_highbd_subtract_block_c),
+ make_tuple(16, 16, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(16, 16, 12, vpx_highbd_subtract_block_c),
+ make_tuple(16, 32, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(16, 32, 12, vpx_highbd_subtract_block_c),
+ make_tuple(32, 16, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(32, 16, 12, vpx_highbd_subtract_block_c),
+ make_tuple(32, 32, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(32, 32, 12, vpx_highbd_subtract_block_c),
+ make_tuple(32, 64, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(32, 64, 12, vpx_highbd_subtract_block_c),
+ make_tuple(64, 32, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(64, 32, 12, vpx_highbd_subtract_block_c),
+ make_tuple(64, 64, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(64, 64, 12, vpx_highbd_subtract_block_c),
+ make_tuple(64, 128, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(64, 128, 12, vpx_highbd_subtract_block_c),
+ make_tuple(128, 64, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(128, 64, 12, vpx_highbd_subtract_block_c),
+ make_tuple(128, 128, 12, vpx_highbd_subtract_block_sse2),
+ make_tuple(128, 128, 12, vpx_highbd_subtract_block_c)));
+#endif // HAVE_SSE2
+#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 47a3219..b06a5e9 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -19,6 +19,23 @@
#include "vpx_dsp/inv_txfm.h"
#include "vpx_ports/mem.h"
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,  // Returns 1 for TX_32X32 and 0 otherwise, except that 10-bit high-bitdepth buffers always get 0.
+ const TX_SIZE tx_size) {
+ (void) tx_type;  // tx_type does not influence the result.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (xd->bd == BITDEPTH_10) {
+ return 0;  // 10-bit: scale 0 for every transform size.
+ } else {
+ return tx_size == TX_32X32;  // Other high-bitdepth depths: same rule as the fallback below.
+ }
+ }
+#else
+ (void)xd;  // xd is only read in high-bitdepth builds.
+#endif
+ return tx_size == TX_32X32;
+}
+
#if CONFIG_EXT_TX
static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
int i;
diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index 31b26b8..ffdad0c 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -14,6 +14,7 @@
#include <assert.h>
#include "./vpx_config.h"
+#include "vp10/common/blockd.h"
#include "vp10/common/common.h"
#include "vp10/common/enums.h"
#include "vpx_dsp/inv_txfm.h"
@@ -48,6 +49,10 @@
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
+#define MAX_TX_SCALE 1
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+ const TX_SIZE tx_size);
+
void vp10_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 7b20239..1d227dd 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -662,11 +662,11 @@
add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp10_highbd_block_error sse2/;
- add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale";
specialize qw/vp10_highbd_quantize_fp/;
- add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp10_highbd_quantize_fp_32x32/;
+ add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const int log_scale";
+ specialize qw/vp10_highbd_quantize_b/;
# fdct functions
add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index b8d409a..58cd9e6 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -15,9 +15,7 @@
#include "vp10/common/blockd.h"
#include "vp10/common/common.h"
#include "vp10/common/entropy.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
#include "vp10/common/idct.h"
-#endif
#include "vp10/decoder/detokenize.h"
@@ -113,15 +111,7 @@
cat6_prob = vp10_cat6_prob;
#endif
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
- dq_shift = 0;
- } else {
- dq_shift = (tx_size == TX_32X32);
- }
-#else
- dq_shift = (tx_size == TX_32X32);
-#endif
+ dq_shift = get_tx_scale(xd, 0, tx_size);
while (c < max_eob) {
int val = -1;
@@ -257,15 +247,7 @@
const uint8_t *cat5_prob;
const uint8_t *cat6_prob;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
- dq_shift = 0;
- } else {
- dq_shift = (tx_size == TX_32X32);
- }
-#else
- dq_shift = (tx_size == TX_32X32);
-#endif
+ dq_shift = get_tx_scale(xd, 0, tx_size);
if (counts) {
coef_counts = counts->coef[tx_size][type][ref];
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index da1885d..7f3b6a2 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -2313,8 +2313,8 @@
vp10_copy(eob_counts_copy, cm->counts.eob_branch);
for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- full_to_model_counts(cm->counts.coef[tx_size],
- subframe_stats->coef_counts_buf[i][tx_size]);
+ vp10_full_to_model_counts(cm->counts.coef[tx_size],
+ subframe_stats->coef_counts_buf[i][tx_size]);
vp10_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
vp10_partial_adapt_probs(cm, 0, 0);
vp10_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 88e9486..06463c1 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -49,6 +49,12 @@
#include "vp10/encoder/segmentation.h"
#include "vp10/encoder/tokenize.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+# define IF_HBD(...) __VA_ARGS__
+#else
+# define IF_HBD(...)
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
static void encode_superblock(VP10_COMP *cpi, ThreadData * td,
TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -413,234 +419,102 @@
}
}
-typedef struct {
- int64_t sum_square_error;
- int64_t sum_error;
- int log2_count;
- int variance;
-} var;
-
-typedef struct {
- var none;
- var horz[2];
- var vert[2];
-} partition_variance;
-
-typedef struct {
- partition_variance part_variances;
- var split[4];
-} v4x4;
-
-typedef struct {
- partition_variance part_variances;
- v4x4 split[4];
-} v8x8;
-
-typedef struct {
- partition_variance part_variances;
- v8x8 split[4];
-} v16x16;
-
-typedef struct {
- partition_variance part_variances;
- v16x16 split[4];
-} v32x32;
-
-typedef struct {
- partition_variance part_variances;
- v32x32 split[4];
-} v64x64;
-
-#if CONFIG_EXT_PARTITION
-typedef struct {
- partition_variance part_variances;
- v64x64 split[4];
-} v128x128;
-#endif // CONFIG_EXT_PARTITION
-
-typedef struct {
- partition_variance *part_variances;
- var *split[4];
-} variance_node;
-
-typedef enum {
- V16X16,
- V32X32,
- V64X64,
-#if CONFIG_EXT_PARTITION
- V128X128,
-#endif // CONFIG_EXT_PARTITION
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
- int i;
- node->part_variances = NULL;
- switch (bsize) {
-#if CONFIG_EXT_PARTITION
- case BLOCK_128X128: {
- v128x128 *vt = (v128x128 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
-#endif // CONFIG_EXT_PARTITION
- case BLOCK_64X64: {
- v64x64 *vt = (v64x64 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_32X32: {
- v32x32 *vt = (v32x32 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_16X16: {
- v16x16 *vt = (v16x16 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_8X8: {
- v8x8 *vt = (v8x8 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_4X4: {
- v4x4 *vt = (v4x4 *) data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i];
- break;
- }
- default: {
- assert(0);
- break;
- }
- }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
- v->sum_square_error = s2;
- v->sum_error = s;
- v->log2_count = c;
-}
-
-static void get_variance(var *v) {
- v->variance = (int)(256 * (v->sum_square_error -
- ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
-}
-
-static void sum_2_variances(const var *a, const var *b, var *r) {
- assert(a->log2_count == b->log2_count);
- fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->log2_count + 1, r);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
- variance_node node;
- memset(&node, 0, sizeof(node));
- tree_to_node(data, bsize, &node);
- sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
- sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
- sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
- sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
- sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
- &node.part_variances->none);
-}
-
-static int set_vt_partitioning(VP10_COMP *cpi,
+static void set_vt_partitioning(VP10_COMP *cpi,
MACROBLOCK *const x,
MACROBLOCKD *const xd,
- void *data,
- BLOCK_SIZE bsize,
+ VAR_TREE *vt,
int mi_row,
int mi_col,
- int64_t threshold,
- BLOCK_SIZE bsize_min,
- int force_split) {
+ const int64_t *const threshold,
+ const BLOCK_SIZE *const bsize_min) {
VP10_COMMON * const cm = &cpi->common;
- variance_node vt;
- const int block_width = num_8x8_blocks_wide_lookup[bsize];
- const int block_height = num_8x8_blocks_high_lookup[bsize];
- const int low_res = (cm->width <= 352 && cm->height <= 288);
+ const int hbw = num_8x8_blocks_wide_lookup[vt->bsize] / 2;
+ const int hbh = num_8x8_blocks_high_lookup[vt->bsize] / 2;
+ const int has_cols = mi_col + hbw < cm->mi_cols;
+ const int has_rows = mi_row + hbh < cm->mi_rows;
- assert(block_height == block_width);
- tree_to_node(data, bsize, &vt);
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ return;
- if (force_split == 1)
- return 0;
+ assert(vt->bsize >= BLOCK_8X8);
+
+ assert(hbh == hbw);
+
+ if (vt->force_split || (!has_cols && !has_rows))
+ goto split;
// For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
// variance is below threshold, otherwise split will be selected.
// No check for vert/horiz split as too few samples for variance.
- if (bsize == bsize_min) {
- // Variance already computed to set the force_split.
- if (low_res || cm->frame_type == KEY_FRAME)
- get_variance(&vt.part_variances->none);
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
- return 1;
+ if (vt->bsize == bsize_min[0]) {
+ if (has_cols && has_rows &&
+ vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ } else {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ if (vt->bsize > BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+ }
+ return;
}
- return 0;
- } else if (bsize > bsize_min) {
- // Variance already computed to set the force_split.
- if (low_res || cm->frame_type == KEY_FRAME)
- get_variance(&vt.part_variances->none);
+ } else if (vt->bsize > bsize_min[0]) {
// For key frame: take split for bsize above 32X32 or very high variance.
if (cm->frame_type == KEY_FRAME &&
- (bsize > BLOCK_32X32 ||
- vt.part_variances->none.variance > (threshold << 4))) {
- return 0;
+ (vt->bsize > BLOCK_32X32 ||
+ vt->variances.none.variance > (threshold[0] << 4))) {
+ goto split;
}
// If variance is low, take the bsize (no split).
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
- return 1;
+ if (has_cols && has_rows &&
+ vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
}
// Check vertical split.
- if (mi_row + block_height / 2 < cm->mi_rows) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- get_variance(&vt.part_variances->vert[0]);
- get_variance(&vt.part_variances->vert[1]);
- if (vt.part_variances->vert[0].variance < threshold &&
- vt.part_variances->vert[1].variance < threshold &&
+ if (has_rows) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+ if (vt->variances.vert[0].variance < threshold[0] &&
+ vt->variances.vert[1].variance < threshold[0] &&
get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
- return 1;
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ return;
}
}
// Check horizontal split.
- if (mi_col + block_width / 2 < cm->mi_cols) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- get_variance(&vt.part_variances->horz[0]);
- get_variance(&vt.part_variances->horz[1]);
- if (vt.part_variances->horz[0].variance < threshold &&
- vt.part_variances->horz[1].variance < threshold &&
+ if (has_cols) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+ if (vt->variances.horz[0].variance < threshold[0] &&
+ vt->variances.horz[1].variance < threshold[0] &&
get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
- return 1;
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ return;
}
}
-
- return 0;
}
- return 0;
+
+split:
+ {
+ set_vt_partitioning(cpi, x, xd, vt->split[0],
+ mi_row, mi_col,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[1],
+ mi_row, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[2],
+ mi_row + hbh, mi_col,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[3],
+ mi_row + hbh, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ return;
+ }
}
// Set the variance split thresholds for following the block sizes:
@@ -654,23 +528,24 @@
const int64_t threshold_base = (int64_t)(threshold_multiplier *
cpi->y_dequant[q][1]);
if (is_key_frame) {
- thresholds[0] = threshold_base;
- thresholds[1] = threshold_base >> 2;
- thresholds[2] = threshold_base >> 2;
- thresholds[3] = threshold_base << 2;
- } else {
thresholds[1] = threshold_base;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base >> 2;
+ thresholds[4] = threshold_base << 2;
+ } else {
+ thresholds[2] = threshold_base;
if (cm->width <= 352 && cm->height <= 288) {
- thresholds[0] = threshold_base >> 2;
- thresholds[2] = threshold_base << 3;
+ thresholds[1] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 3;
} else {
- thresholds[0] = threshold_base;
- thresholds[1] = (5 * threshold_base) >> 2;
+ thresholds[1] = threshold_base;
+ thresholds[2] = (5 * threshold_base) >> 2;
if (cm->width >= 1920 && cm->height >= 1080)
- thresholds[1] = (7 * threshold_base) >> 2;
- thresholds[2] = threshold_base << cpi->oxcf.speed;
+ thresholds[2] = (7 * threshold_base) >> 2;
+ thresholds[3] = threshold_base << cpi->oxcf.speed;
}
}
+ thresholds[0] = INT64_MIN;
}
void vp10_set_variance_partition_thresholds(VP10_COMP *cpi, int q) {
@@ -699,10 +574,10 @@
}
// Compute the minmax over the 8x8 subblocks.
-static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x16_idx, int y16_idx,
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
#if CONFIG_VP9_HIGHBITDEPTH
- int highbd_flag,
+ int highbd,
#endif
int pixels_wide,
int pixels_high) {
@@ -711,24 +586,26 @@
int minmax_min = 255;
// Loop over the 4 8x8 subblocks.
for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
+ const int x8_idx = ((k & 1) << 3);
+ const int y8_idx = ((k >> 1) << 3);
int min = 0;
int max = 0;
if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ const int src_offset = y8_idx * src_stride + x8_idx;
+ const int ref_offset = y8_idx * ref_stride + x8_idx;
#if CONFIG_VP9_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
- d + y8_idx * dp + x8_idx, dp,
+ if (highbd) {
+ vpx_highbd_minmax_8x8(src + src_offset, src_stride,
+ ref + ref_offset, ref_stride,
&min, &max);
} else {
- vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
- d + y8_idx * dp + x8_idx, dp,
+ vpx_minmax_8x8(src + src_offset, src_stride,
+ ref + ref_offset, ref_stride,
&min, &max);
}
#else
- vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
- d + y8_idx * dp + x8_idx, dp,
+ vpx_minmax_8x8(src + src_offset, src_stride,
+ ref + ref_offset, ref_stride,
&min, &max);
#endif
if ((max - min) > minmax_max)
@@ -740,110 +617,252 @@
return (minmax_max - minmax_min);
}
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x8_idx, int y8_idx, v8x8 *vst,
#if CONFIG_VP9_HIGHBITDEPTH
- int highbd_flag,
-#endif
- int pixels_wide,
- int pixels_high,
- int is_key_frame) {
- int k;
- for (k = 0; k < 4; k++) {
- int x4_idx = x8_idx + ((k & 1) << 2);
- int y4_idx = y8_idx + ((k >> 1) << 2);
- unsigned int sse = 0;
- int sum = 0;
- if (x4_idx < pixels_wide && y4_idx < pixels_high) {
- int s_avg;
- int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- } else {
- s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- }
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,  // High-bitdepth build: returns the 4x4 average via the highbd or the regular routine.
+ const int highbd) {
+ if (highbd) {  // Non-zero selects vpx_highbd_avg_4x4.
+ return vpx_highbd_avg_4x4(src, stride);
+ } else {
+ return vpx_avg_4x4(src, stride);
+ }
+}
#else
- s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+ return vpx_avg_4x4(src, stride);
+}
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,  // High-bitdepth build: returns the 8x8 average via the highbd or the regular routine.
+ const int highbd) {
+ if (highbd) {  // Non-zero selects vpx_highbd_avg_8x8.
+ return vpx_highbd_avg_8x8(src, stride);
+ } else {
+ return vpx_avg_8x8(src, stride);
+ }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {  // Build without high bitdepth: always the regular routine, no highbd parameter.
+ return vpx_avg_8x8(src, stride);
+}
+#endif
+
+static void init_variance_tree(VAR_TREE *const vt,  // Records block size, pixel extent and src/ref pointers in vt, then recurses into split[0..3] while bsize > leaf_size.
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int highbd,
+#endif
+ BLOCK_SIZE bsize,
+ BLOCK_SIZE leaf_size,
+ const int width, const int height,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const ref, const int ref_stride) {
+ assert(bsize >= leaf_size);
+
+ vt->bsize = bsize;
+
+ vt->force_split = 0;  // Every node starts with no forced split.
+
+ vt->src = src;
+ vt->src_stride = src_stride;
+ vt->ref = ref;
+ vt->ref_stride = ref_stride;
+
+ vt->width = width;
+ vt->height = height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vt->highbd = highbd;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (bsize > leaf_size) {
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const int px = num_4x4_blocks_wide_lookup[subsize] * 4;  // 4 * (number of 4x4 columns in subsize); used as the child offset in both directions.
+
+ init_variance_tree(vt->split[0],  // split[0]: same origin as the parent; extent clamped to px.
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ subsize, leaf_size,
+ VPXMIN(px, width), VPXMIN(px, height),
+ src, src_stride,
+ ref, ref_stride);
+ init_variance_tree(vt->split[1],  // split[1]: px columns to the right; width is the unclamped remainder and may be <= 0.
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ subsize, leaf_size,
+ width - px, VPXMIN(px, height),
+ src + px, src_stride,
+ ref + px, ref_stride);
+ init_variance_tree(vt->split[2],  // split[2]: px rows down; height is the unclamped remainder and may be <= 0.
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ subsize, leaf_size,
+ VPXMIN(px, width), height - px,
+ src + px * src_stride, src_stride,
+ ref + px * ref_stride, ref_stride);
+ init_variance_tree(vt->split[3],  // split[3]: px rows down and px columns right; both extents are remainders.
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ subsize, leaf_size,
+ width - px, height - px,
+ src + px * src_stride + px, src_stride,
+ ref + px * ref_stride + px, ref_stride);
+ }
}
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x16_idx, int y16_idx, v16x16 *vst,
-#if CONFIG_VP9_HIGHBITDEPTH
- int highbd_flag,
-#endif
- int pixels_wide,
- int pixels_high,
- int is_key_frame) {
- int k;
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
+
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt,
+ const BLOCK_SIZE leaf_size) {
+ if (vt->bsize > leaf_size) {
+ fill_variance_tree(vt->split[0], leaf_size);
+ fill_variance_tree(vt->split[1], leaf_size);
+ fill_variance_tree(vt->split[2], leaf_size);
+ fill_variance_tree(vt->split[3], leaf_size);
+ fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
unsigned int sse = 0;
int sum = 0;
- if (x8_idx < pixels_wide && y8_idx < pixels_high) {
- int s_avg;
- int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- } else {
- s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- }
-#else
- s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame)
- d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
+ int src_avg;
+ int ref_avg;
+ assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+ if (leaf_size == BLOCK_4X4) {
+ src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ } else {
+ src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
}
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ sum = src_avg - ref_avg;
+ sse = sum * sum;
+ fill_variance(sse, sum, 0, &vt->variances.none);
}
}
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {  // Recomputes statistics from 4x4 averages beneath every 16x16 node whose variance exceeds threshold.
+ if (vt->bsize >= BLOCK_8X8) {
+ if (vt->bsize == BLOCK_16X16) {
+ if (vt->variances.none.variance <= threshold)
+ return;  // 16x16 variance at or below threshold: leave this subtree untouched.
+ else
+ vt->force_split = 0;  // NOTE(review): the flag is cleared although the variance exceeded threshold - confirm this is intended.
+ }
+
+ refine_variance_tree(vt->split[0], threshold);
+ refine_variance_tree(vt->split[1], threshold);
+ refine_variance_tree(vt->split[2], threshold);
+ refine_variance_tree(vt->split[3], threshold);
+
+ if (vt->bsize <= BLOCK_16X16)  // Only nodes of 16x16 and smaller are re-aggregated here; larger nodes keep their values.
+ fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);  // Leaf with no pixel extent: zero statistics.
+ } else {
+ const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));  // IF_HBD appends the highbd argument only in high-bitdepth builds.
+ const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ const int sum = src_avg - ref_avg;
+ const unsigned int sse = sum * sum;
+ assert(vt->bsize == BLOCK_4X4);
+ fill_variance(sse, sum, 0, &vt->variances.none);  // One averaged sample per leaf, hence the third argument 0.
+ }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt,  // Sets force_split on 32x32 nodes whose variance exceeds threshold, ORs it into each ancestor, and returns vt->force_split.
+ const int64_t threshold) {
+ if (vt->bsize == BLOCK_32X32) {  // Recursion stops at 32x32; assumes the walk starts at a node of at least that size - TODO confirm.
+ vt->force_split = vt->variances.none.variance > threshold;
+ } else {
+ vt->force_split |= check_split_key_frame(vt->split[0], threshold);  // |= also keeps any value force_split already had.
+ vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+ }
+ return vt->force_split;
+}
+
+static int check_split(VP10_COMP *const cpi,  // Decides force_split at 16x16 nodes, propagates it upwards, and returns vt->force_split; thresholds advances one entry per level.
+ VAR_TREE *const vt,
+ const int segment_id,
+ const int64_t *const thresholds
+ ) {
+ if (vt->bsize == BLOCK_16X16) {  // Recursion stops here; assumes the walk starts at a node of at least 16x16 - TODO confirm.
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ if (!vt->force_split &&
+ vt->variances.none.variance > thresholds[-1] &&  // thresholds[-1] is the parent level's entry, since each level is called with thresholds + 1.
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(vt->src, vt->src_stride,
+ vt->ref, vt->ref_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+ vt->highbd,
+#endif
+ vt->width, vt->height);
+ vt->force_split = minmax > cpi->vbp_threshold_minmax;
+ }
+ } else {
+ vt->force_split |= check_split(cpi, vt->split[0],  // |= also keeps any value force_split already had.
+ segment_id, thresholds + 1);
+ vt->force_split |= check_split(cpi, vt->split[1],
+ segment_id, thresholds + 1);
+ vt->force_split |= check_split(cpi, vt->split[2],
+ segment_id, thresholds + 1);
+ vt->force_split |= check_split(cpi, vt->split[3],
+ segment_id, thresholds + 1);
+
+ if (vt->bsize == BLOCK_32X32 && !vt->force_split) {  // A 32x32 node is tested on its own variance only when no child forced a split.
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ }
+ }
+
+ return vt->force_split;
+}
+
// This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-static int choose_partitioning(VP10_COMP *cpi,
+// reconstructed last (or golden), where variance is computed for down-sampled
+// inputs.
+static void choose_partitioning(VP10_COMP *const cpi,
+ ThreadData *const td,
const TileInfo *const tile,
- MACROBLOCK *x,
- int mi_row, int mi_col) {
- VP10_COMMON * const cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- int i, j, k, m;
- v64x64 vt;
- v16x16 vt2[16];
- int force_split[21];
- uint8_t *s;
- const uint8_t *d;
- int sp;
- int dp;
+ MACROBLOCK *const x,
+ const int mi_row, const int mi_col) {
+ VP10_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+ int i;
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
int pixels_wide = 8 * num_8x8_blocks_wide_lookup[cm->sb_size];
int pixels_high = 8 * num_8x8_blocks_high_lookup[cm->sb_size];
- int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
- cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+ int64_t thresholds[5] = {
+ cpi->vbp_thresholds[0],
+ cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2],
+ cpi->vbp_thresholds[3],
+ cpi->vbp_thresholds[4],
+ };
+ BLOCK_SIZE bsize_min[5] = {
+ BLOCK_16X16,
+ BLOCK_16X16,
+ BLOCK_16X16,
+ cpi->vbp_bsize_min,
+ BLOCK_8X8
+ };
+ const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+ const int64_t *const thre = thresholds + start_level;
+ const BLOCK_SIZE *const bmin = bsize_min + start_level;
- // Always use 4x4 partition for key frame.
const int is_key_frame = (cm->frame_type == KEY_FRAME);
- const int use_4x4_partition = is_key_frame;
const int low_res = (cm->width <= 352 && cm->height <= 288);
- int variance4x4downsample[16];
int segment_id = CR_SEGMENT_ID_BASE;
@@ -858,11 +877,6 @@
}
}
-#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
- printf("Not yet implemented: choose_partitioning\n");
- exit(-1);
-#endif // CONFIG_EXT_PARTITION
-
set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
if (xd->mb_to_right_edge < 0)
@@ -870,33 +884,31 @@
if (xd->mb_to_bottom_edge < 0)
pixels_high += (xd->mb_to_bottom_edge >> 3);
- s = x->plane[0].src.buf;
- sp = x->plane[0].src.stride;
+ src = x->plane[0].src.buf;
+ src_stride = x->plane[0].src.stride;
if (!is_key_frame) {
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
unsigned int uv_sad;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-
- const YV12_BUFFER_CONFIG *yv12_g = NULL;
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
- const int max_mi_block_size = cm->mib_size;
- const int is_right_edge = mi_col + max_mi_block_size / 2 > cm->mi_cols;
- const int is_left_edge = mi_row + max_mi_block_size / 2 > cm->mi_rows;
+ const int hbs = cm->mib_size / 2;
+ const int split_vert = mi_col + hbs >= cm->mi_cols;
+ const int split_horz = mi_row + hbs >= cm->mi_rows;
BLOCK_SIZE bsize;
- if (is_right_edge && is_left_edge)
+ if (split_vert && split_horz)
bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
- else if (is_right_edge)
+ else if (split_vert)
bsize = get_subsize(cm->sb_size, PARTITION_VERT);
- else if (is_left_edge)
+ else if (split_horz)
bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
else
bsize = cm->sb_size;
assert(yv12 != NULL);
- yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
if (yv12_g && yv12_g != yv12) {
vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -918,6 +930,7 @@
mbmi->interp_filter = BILINEAR;
y_sad = vp10_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
if (y_sad_g < y_sad) {
vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -944,196 +957,65 @@
x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
}
- d = xd->plane[0].dst.buf;
- dp = xd->plane[0].dst.stride;
+ ref = xd->plane[0].dst.buf;
+ ref_stride = xd->plane[0].dst.stride;
// If the y_sad is very small, take the largest partition and exit.
// Don't check on boosted segment for now, as largest is suppressed there.
if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
- if (!is_right_edge && !is_left_edge) {
+ if (!split_vert && !split_horz) {
set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
- return 0;
+ return;
}
}
} else {
- d = VP10_VAR_OFFS;
- dp = 0;
+ ref = VP10_VAR_OFFS;
+ ref_stride = 0;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (xd->bd) {
case 10:
- d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
+ ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
break;
case 12:
- d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
+ ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
break;
case 8:
default:
- d = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
+ ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
break;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
- // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
- // 5-20 for the 16x16 blocks.
- force_split[0] = 0;
- // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
- // for splits.
- for (i = 0; i < 4; i++) {
- const int x32_idx = ((i & 1) << 5);
- const int y32_idx = ((i >> 1) << 5);
- const int i2 = i << 2;
- force_split[i + 1] = 0;
- for (j = 0; j < 4; j++) {
- const int x16_idx = x32_idx + ((j & 1) << 4);
- const int y16_idx = y32_idx + ((j >> 1) << 4);
- const int split_index = 5 + i2 + j;
- v16x16 *vst = &vt.split[i].split[j];
- force_split[split_index] = 0;
- variance4x4downsample[i2 + j] = 0;
- if (!is_key_frame) {
- fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+ init_variance_tree(vt,
#if CONFIG_VP9_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide,
- pixels_high,
- is_key_frame);
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
- get_variance(&vt.split[i].split[j].part_variances.none);
- if (vt.split[i].split[j].part_variances.none.variance >
- thresholds[2]) {
- // 16X16 variance is above threshold for split, so force split to 8x8
- // for this 16x16 block (this also forces splits for upper levels).
- force_split[split_index] = 1;
- force_split[i + 1] = 1;
- force_split[0] = 1;
- } else if (vt.split[i].split[j].part_variances.none.variance >
- thresholds[1] &&
- !cyclic_refresh_segment_id_boosted(segment_id)) {
- // We have some nominal amount of 16x16 variance (based on average),
- // compute the minmax over the 8x8 sub-blocks, and if above threshold,
- // force split to 8x8 block for this 16x16 block.
- int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
-#if CONFIG_VP9_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide, pixels_high);
- if (minmax > cpi->vbp_threshold_minmax) {
- force_split[split_index] = 1;
- force_split[i + 1] = 1;
- force_split[0] = 1;
- }
- }
- }
- if (is_key_frame || (low_res &&
- vt.split[i].split[j].part_variances.none.variance >
- (thresholds[1] << 1))) {
- force_split[split_index] = 0;
- // Go down to 4x4 down-sampling for variance.
- variance4x4downsample[i2 + j] = 1;
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
- v8x8 *vst2 = is_key_frame ? &vst->split[k] :
- &vt2[i2 + j].split[k];
- fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
-#if CONFIG_VP9_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide,
- pixels_high,
- is_key_frame);
- }
- }
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ cm->sb_size,
+ (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+ pixels_wide, pixels_high,
+ src, src_stride, ref, ref_stride);
+
+ // Fill in the entire tree of variances and compute splits.
+ if (is_key_frame) {
+ fill_variance_tree(vt, BLOCK_4X4);
+ check_split_key_frame(vt, thre[1]);
+ } else {
+ fill_variance_tree(vt, BLOCK_8X8);
+ check_split(cpi, vt, segment_id, thre);
+ if (low_res) {
+ refine_variance_tree(vt, thre[1] << 1);
}
}
- // Fill the rest of the variance tree by summing split partition values.
- for (i = 0; i < 4; i++) {
- const int i2 = i << 2;
- for (j = 0; j < 4; j++) {
- if (variance4x4downsample[i2 + j] == 1) {
- v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
- &vt.split[i].split[j];
- for (m = 0; m < 4; m++)
- fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
- fill_variance_tree(vtemp, BLOCK_16X16);
- }
- }
- fill_variance_tree(&vt.split[i], BLOCK_32X32);
- // If variance of this 32x32 block is above the threshold, force the block
- // to split. This also forces a split on the upper (64x64) level.
- if (!force_split[i + 1]) {
- get_variance(&vt.split[i].part_variances.none);
- if (vt.split[i].part_variances.none.variance > thresholds[1]) {
- force_split[i + 1] = 1;
- force_split[0] = 1;
- }
- }
- }
- if (!force_split[0]) {
- fill_variance_tree(&vt, BLOCK_64X64);
- get_variance(&vt.part_variances.none);
- }
+ vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+ mi_row + cm->mib_size > cm->mi_rows;
// Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold.
- if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
- !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
- thresholds[0], BLOCK_16X16, force_split[0])) {
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- const int i2 = i << 2;
- if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx),
- thresholds[1], BLOCK_16X16,
- force_split[i + 1])) {
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- // For inter frames: if variance4x4downsample[] == 1 for this 16x16
- // block, then the variance is based on 4x4 down-sampling, so use vt2
- // in set_vt_partioning(), otherwise use vt.
- v16x16 *vtemp = (!is_key_frame &&
- variance4x4downsample[i2 + j] == 1) ?
- &vt2[i2 + j] : &vt.split[i].split[j];
- if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
- mi_row + y32_idx + y16_idx,
- mi_col + x32_idx + x16_idx,
- thresholds[2],
- cpi->vbp_bsize_min,
- force_split[5 + i2 + j])) {
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- if (use_4x4_partition) {
- if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
- BLOCK_8X8,
- mi_row + y32_idx + y16_idx + y8_idx,
- mi_col + x32_idx + x16_idx + x8_idx,
- thresholds[3], BLOCK_8X8, 0)) {
- set_block_size(cpi, x, xd,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx),
- BLOCK_4X4);
- }
- } else {
- set_block_size(cpi, x, xd,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx),
- BLOCK_8X8);
- }
- }
- }
- }
- }
- }
- }
- return 0;
+ set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
}
static void update_state(VP10_COMP *cpi, ThreadData *td,
@@ -2596,10 +2478,6 @@
int chosen_rate_nocoef = INT_MAX;
#endif
-#if CONFIG_EXT_PARTITION_TYPES
- assert(0);
-#endif
-
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -2823,6 +2701,13 @@
#endif
}
break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ assert(0 && "Cannot handle extended partiton types");
+#endif // CONFIG_EXT_PARTITION_TYPES
default:
assert(0);
break;
@@ -4282,9 +4167,8 @@
&dummy_rate_nocoef,
#endif // CONFIG_SUPERTX
1, pc_root);
- } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
- cm->frame_type != KEY_FRAME) {
- choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+ choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
cm->sb_size, &dummy_rate, &dummy_dist,
#if CONFIG_SUPERTX
@@ -4318,8 +4202,8 @@
SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
for (t = TX_4X4; t <= TX_32X32; ++t)
- full_to_model_counts(cpi->td.counts->coef[t],
- cpi->td.rd_counts.coef_counts[t]);
+ vp10_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
vp10_partial_adapt_probs(cm, mi_row, mi_col);
++cm->coef_probs_update_idx;
vp10_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
@@ -4328,7 +4212,7 @@
cpi->td.rd_counts.coef_counts);
vp10_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
cm->counts.eob_branch);
- fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
}
}
#endif // CONFIG_ENTROPY
@@ -4553,6 +4437,10 @@
#endif
#endif
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->td.var_root[0] == NULL)
+ vp10_setup_var_tree(&cpi->common, &cpi->td);
+
{
struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 10e97cb..9acf00c 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -129,15 +129,7 @@
assert((!type && !plane) || (type && plane));
assert(eob <= default_eob);
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH && xd->bd == BITDEPTH_10) {
- mul = 1;
- } else {
- mul = 1 + (tx_size == TX_32X32);
- }
-#else
- mul = 1 + (tx_size == TX_32X32);
-#endif
+ mul = 1 << get_tx_scale(xd, tx_type, tx_size);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
if (!ref)
@@ -323,35 +315,29 @@
#if CONFIG_VP9_HIGHBITDEPTH
typedef enum QUANT_FUNC {
QUANT_FUNC_LOWBD = 0,
- QUANT_FUNC_LOWBD_32 = 1,
- QUANT_FUNC_HIGHBD = 2,
- QUANT_FUNC_HIGHBD_32 = 3,
- QUANT_FUNC_LAST = 4
-} QUANT_FUNC;
-
-static VP10_QUANT_FACADE
- quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
- {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade,
- vp10_highbd_quantize_fp_facade, vp10_highbd_quantize_fp_32x32_facade},
- {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade,
- vp10_highbd_quantize_b_facade, vp10_highbd_quantize_b_32x32_facade},
- {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade,
- vp10_highbd_quantize_dc_facade, vp10_highbd_quantize_dc_32x32_facade},
- {NULL, NULL, NULL, NULL}};
-
-#else
-typedef enum QUANT_FUNC {
- QUANT_FUNC_LOWBD = 0,
- QUANT_FUNC_LOWBD_32 = 1,
+ QUANT_FUNC_HIGHBD = 1,
QUANT_FUNC_LAST = 2
} QUANT_FUNC;
static VP10_QUANT_FACADE
quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
- {vp10_quantize_fp_facade, vp10_quantize_fp_32x32_facade},
- {vp10_quantize_b_facade, vp10_quantize_b_32x32_facade},
- {vp10_quantize_dc_facade, vp10_quantize_dc_32x32_facade},
+ {vp10_quantize_fp_facade, vp10_highbd_quantize_fp_facade},
+ {vp10_quantize_b_facade, vp10_highbd_quantize_b_facade},
+ {vp10_quantize_dc_facade, vp10_highbd_quantize_dc_facade},
{NULL, NULL}};
+
+#else
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_LAST = 1
+} QUANT_FUNC;
+
+static VP10_QUANT_FACADE
+ quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+ {vp10_quantize_fp_facade},
+ {vp10_quantize_b_facade},
+ {vp10_quantize_dc_facade},
+ {NULL}};
#endif
static FWD_TXFM_OPT fwd_txfm_opt_list[VP10_XFORM_QUANT_LAST] = {
@@ -378,7 +364,9 @@
const int tx2d_size = tx1d_size * tx1d_size;
FWD_TXFM_PARAM fwd_txfm_param;
- fwd_txfm_param.tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ QUANT_PARAM qparam;
+
+ fwd_txfm_param.tx_type = tx_type;
fwd_txfm_param.tx_size = tx_size;
fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[xform_quant_idx];
fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
@@ -386,6 +374,7 @@
src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ qparam.log_scale = get_tx_scale(xd, tx_type, tx_size);
#if CONFIG_VP9_HIGHBITDEPTH
fwd_txfm_param.bd = xd->bd;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -394,12 +383,9 @@
if (x->skip_block) {
vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
} else {
- if (tx_size == TX_32X32 && xd->bd != 10)
- quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD_32](
- coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
- else
- quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
- coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+ quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+ scan_order, &qparam);
}
}
return;
@@ -411,12 +397,9 @@
if (x->skip_block) {
vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
} else {
- if (tx_size == TX_32X32)
- quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD_32](
- coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
- else
- quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
- coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order);
+ quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+ scan_order, &qparam);
}
}
}
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index a39575b..f0de8ef 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -463,6 +463,9 @@
vp10_free_pc_tree(&cpi->td);
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ vp10_free_var_tree(&cpi->td);
+
if (cpi->common.allow_screen_content_tools)
vpx_free(cpi->td.mb.palette_buffer);
@@ -1999,6 +2002,8 @@
CHECK_MEM_ERROR(cm, x->palette_buffer,
vpx_memalign(16, sizeof(*x->palette_buffer)));
}
+ // Reallocate the pc_tree, as its contents depend on
+ // the state of cm->allow_screen_content_tools
vp10_free_pc_tree(&cpi->td);
vp10_setup_pc_tree(&cpi->common, &cpi->td);
}
@@ -2586,6 +2591,8 @@
vpx_free(thread_data->td->mb.palette_buffer);
vpx_free(thread_data->td->counts);
vp10_free_pc_tree(thread_data->td);
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ vp10_free_var_tree(thread_data->td);
vpx_free(thread_data->td);
}
}
@@ -3406,13 +3413,9 @@
model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
}
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
- vp10_coeff_count *full_count) {
-#else
-static void full_to_model_counts(vp10_coeff_count_model *model_count,
- vp10_coeff_count *full_count) {
-#endif // CONFIG_ENTROPY
+
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+ vp10_coeff_count *full_count) {
int i, j, k, l;
for (i = 0; i < PLANE_TYPES; ++i)
@@ -4403,8 +4406,8 @@
vp10_update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- full_to_model_counts(cpi->td.counts->coef[t],
- cpi->td.rd_counts.coef_counts[t]);
+ vp10_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
#if CONFIG_ENTROPY
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index bf7815f..0f0d1f3 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -34,6 +34,7 @@
#include "vp10/encoder/rd.h"
#include "vp10/encoder/speed_features.h"
#include "vp10/encoder/tokenize.h"
+#include "vp10/encoder/variance_tree.h"
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp10/encoder/denoiser.h"
@@ -267,6 +268,9 @@
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+ VAR_TREE *var_tree;
+ VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
} ThreadData;
struct EncWorkerData;
@@ -568,9 +572,12 @@
int resize_count;
// VAR_BASED_PARTITION thresholds
- // 0 - threshold_64x64; 1 - threshold_32x32;
- // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
- int64_t vbp_thresholds[4];
+ // 0 - threshold_128x128;
+ // 1 - threshold_64x64;
+ // 2 - threshold_32x32;
+ // 3 - threshold_16x16;
+ // 4 - threshold_8x8;
+ int64_t vbp_thresholds[5];
int64_t vbp_threshold_minmax;
int64_t vbp_threshold_sad;
BLOCK_SIZE vbp_bsize_min;
@@ -630,10 +637,8 @@
int vp10_get_quantizer(struct VP10_COMP *cpi);
-#if CONFIG_ENTROPY
-void full_to_model_counts(vp10_coeff_count_model *model_count,
- vp10_coeff_count *full_count);
-#endif // CONFIG_ENTROPY
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+ vp10_coeff_count *full_count);
static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
return frame_is_intra_only(&cpi->common) ||
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index 2742ed2..e552ec5 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -93,6 +93,10 @@
thread_data->td->pc_tree = NULL;
vp10_setup_pc_tree(cm, thread_data->td);
+ // Set up this worker's variance tree if needed.
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ vp10_setup_var_tree(cm, thread_data->td);
+
// Allocate frame counters in thread data.
CHECK_MEM_ERROR(cm, thread_data->td->counts,
vpx_calloc(1, sizeof(*thread_data->td->counts)));
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 3f8f0f4..3919fee 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -33,52 +33,72 @@
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
+ const scan_order *sc, const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
- vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
- pd->dequant, eob_ptr, sc->scan, sc->iscan);
+ if (qparam->log_scale == 0) {
+ vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan);
+ } else {
+ vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr,
+ dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+ sc->iscan);
+ }
}
void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
+ const scan_order *sc, const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
- vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
- sc->scan, sc->iscan);
+ if (qparam->log_scale == 0) {
+ vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+ eob_ptr, sc->scan, sc->iscan);
+ } else {
+ vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan);
+ }
}
void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
+ const scan_order *sc, const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
(void)sc;
- vpx_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
- p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
- eob_ptr);
+ if (qparam->log_scale == 0) {
+ vpx_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+ eob_ptr);
+ } else {
+ vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
+ }
}
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_quantize_fp_facade(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+ const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
- vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
- pd->dequant, eob_ptr, sc->scan, sc->iscan);
+ vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+ sc->scan, sc->iscan, qparam->log_scale);
}
void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
@@ -86,114 +106,30 @@
tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
+ const scan_order *sc,
+ const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
- vpx_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
- pd->dequant, eob_ptr, sc->scan, sc->iscan);
+ vp10_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan,
+ qparam->log_scale);
}
void vp10_highbd_quantize_dc_facade(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+ const QUANT_PARAM *qparam) {
// obsolete skip_block
const int skip_block = 0;
(void)sc;
- vpx_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ vp10_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr,
- pd->dequant[0], eob_ptr);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
- pd->dequant, eob_ptr, sc->scan, sc->iscan);
-}
-
-void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
- pd->dequant, eob_ptr, sc->scan, sc->iscan);
-}
-
-void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- (void)sc;
- (void)n_coeffs;
-
- vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
- qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- vp10_highbd_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
- p->round_fp, p->quant_fp, p->quant_shift,
- qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
- sc->scan, sc->iscan);
-}
-
-void vp10_highbd_quantize_b_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- vpx_highbd_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
- p->round, p->quant, p->quant_shift, qcoeff_ptr,
- dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
- sc->iscan);
-}
-
-void vp10_highbd_quantize_dc_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc) {
- // obsolete skip_block
- const int skip_block = 0;
-
- (void)sc;
- (void)n_coeffs;
-
- vpx_highbd_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
- qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
- eob_ptr);
+ pd->dequant[0], eob_ptr, qparam->log_scale);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -250,9 +186,11 @@
const int16_t *dequant_ptr,
uint16_t *eob_ptr,
const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *iscan, const int log_scale) {
int i;
int eob = -1;
+ const int scale = 1 << log_scale;
+ const int shift = 16 - log_scale;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
@@ -271,9 +209,10 @@
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp = abs_coeff + round_ptr[rc != 0];
- const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
if (abs_qcoeff)
eob = i;
}
@@ -325,49 +264,101 @@
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr,
- uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- int i, eob = -1;
- (void)zbin_ptr;
- (void)quant_shift_ptr;
+void vp10_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan, const int log_scale) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ int round[2] = {round_ptr[0], round_ptr[1]};
+ int nzbins[2];
+ int scale = 1;
+ int shift = 16;
(void)iscan;
+ if (log_scale > 0) {
+ zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale);
+ zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale);
+ round[0] = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ round[1] = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ scale = 1 << log_scale;
+ shift = 16 - log_scale;
+ }
+
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
+
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- uint32_t abs_qcoeff = 0;
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
+
+ // Quantization pass: All coefficients with index >= non_zero_count are
+ // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
- const int64_t tmp = abs_coeff
- + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+ if (abs_coeff >= zbins[rc != 0]) {
+ const int64_t tmp1 = abs_coeff + round[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+ if (abs_qcoeff)
+ eob = i;
}
-
- if (abs_qcoeff)
- eob = i;
}
}
*eob_ptr = eob + 1;
}
#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+// Quantizes only the DC coefficient of a high-bitdepth transform block.
+//
+// qcoeff_ptr and dqcoeff_ptr are zeroed over n_coeffs entries first. When
+// skip_block is non-zero nothing else is written and *eob_ptr is set to 0.
+// Otherwise coeff_ptr[0] is rounded, multiplied by `quant` and shifted right
+// by (16 - log_scale); the dequantized value is
+// qcoeff * dequant_ptr / (1 << log_scale). *eob_ptr is 1 when the quantized
+// DC value is non-zero and 0 otherwise.
+//
+// Despite its name, dequant_ptr is the DC dequantizer passed by value.
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                             int n_coeffs, int skip_block,
+                             const int16_t *round_ptr, const int16_t quant,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t dequant_ptr, uint16_t *eob_ptr,
+                             const int log_scale) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    // The rounding offset has to be reduced by the same log_scale that
+    // shortens the final shift; otherwise the effective rounding doubles for
+    // every scale step. The guard keeps ROUND_POWER_OF_TWO away from a zero
+    // shift count, as the block quantizer in this file does.
+    const int round = (log_scale > 0)
+                          ? ROUND_POWER_OF_TWO(round_ptr[0], log_scale)
+                          : round_ptr[0];
+    const int64_t tmp = abs_coeff + round;
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+    if (abs_qcoeff)
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif
+
void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan) {
MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp10/encoder/quantize.h b/vp10/encoder/quantize.h
index 6128460..5e62eb2 100644
--- a/vp10/encoder/quantize.h
+++ b/vp10/encoder/quantize.h
@@ -19,12 +19,17 @@
extern "C" {
#endif
+// Extra parameters passed to a VP10_QUANT_FACADE quantizer.
+typedef struct QUANT_PARAM {
+ int log_scale;  // NOTE(review): presumably the log_scale the quantizers use for the (16 - log_scale) shift and the (1 << log_scale) dequant divisor - confirm in the facades
+} QUANT_PARAM;
+
typedef void (*VP10_QUANT_FACADE)(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
+ const scan_order *sc,
+ const QUANT_PARAM *qparam);
typedef struct {
// 0: dc 1: ac 2-8: ac repeated to SIMD width
@@ -48,7 +53,6 @@
void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
-
struct VP10_COMP;
struct VP10Common;
@@ -71,74 +75,48 @@
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
+ const scan_order *sc, const QUANT_PARAM *qparam);
void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
+ const scan_order *sc, const QUANT_PARAM *qparam);
void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
+ const scan_order *sc, const QUANT_PARAM *qparam);
#if CONFIG_VP9_HIGHBITDEPTH
void vp10_highbd_quantize_fp_facade(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+ const QUANT_PARAM *qparam);
void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr,
const MACROBLOCKD_PLANE *pd,
tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
+ const scan_order *sc,
+ const QUANT_PARAM *qparam);
void vp10_highbd_quantize_dc_facade(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+ const QUANT_PARAM *qparam);
+
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const int log_scale);
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp10_quantize_fp_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
-
-void vp10_quantize_b_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
-
-void vp10_quantize_dc_32x32_facade(const tran_low_t *coeff_ptr,
- intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr,
- const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
- const scan_order *sc);
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-
-void vp10_highbd_quantize_b_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-
-void vp10_highbd_quantize_dc_32x32_facade(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
- tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
- tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc);
-#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index ce9fad7..dc34f1f 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -152,13 +152,8 @@
#endif // CONFIG_EXT_INTRA
}
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
- vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#else
-static void fill_token_costs(vp10_coeff_cost *c,
- vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
-#endif // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+ vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
int i, j, k, l;
TX_SIZE t;
for (t = TX_4X4; t <= TX_32X32; ++t)
@@ -397,7 +392,7 @@
#endif
}
if (cpi->oxcf.pass != 1) {
- fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ vp10_fill_token_costs(x->token_costs, cm->fc->coef_probs);
if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
cm->frame_type == KEY_FRAME) {
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 80749dc..7aad9eb 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -341,10 +341,8 @@
int (*fact)[MAX_MODES], int rd_thresh,
int bsize, int best_mode_index);
-#if CONFIG_ENTROPY
-void fill_token_costs(vp10_coeff_cost *c,
- vp10_coeff_probs_model (*p)[PLANE_TYPES]);
-#endif // CONFIG_ENTROPY
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+ vp10_coeff_probs_model (*p)[PLANE_TYPES]);
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
int thresh_fact) {
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index d4538af..05cb75c 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1001,7 +1001,7 @@
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
- int shift = tx_size == TX_32X32 ? 0 : 2;
+ int shift = (MAX_TX_SCALE - get_tx_scale(xd, 0, tx_size)) * 2;
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1175,19 +1175,11 @@
const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
const int64_t resd_sse = coeff[0] - dqcoeff[0];
int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+ int shift = (MAX_TX_SCALE - get_tx_scale(xd, 0, tx_size)) * 2;
#if CONFIG_VP9_HIGHBITDEPTH
dc_correct >>= ((xd->bd - 8) * 2);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH &&
- xd->bd == BITDEPTH_10) {
- dc_correct >>= 2;
- } else {
- if (tx_size != TX_32X32)
- dc_correct >>= 2;
- }
-#else
- if (tx_size != TX_32X32)
- dc_correct >>= 2;
#endif
+ dc_correct >>= shift;
dist = VPXMAX(0, sse - dc_correct);
}
diff --git a/vp10/encoder/variance_tree.c b/vp10/encoder/variance_tree.c
new file mode 100644
index 0000000..d11ef2d
--- /dev/null
+++ b/vp10/encoder/variance_tree.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/variance_tree.h"
+#include "vp10/encoder/encoder.h"
+
+
+
+// (Re)allocates td->var_tree and links it into a quad-tree.
+//
+// The nodes live in one zero-initialised array: all leaves first, then each
+// coarser level in turn, with the root in the last slot. Any previously
+// allocated tree is freed first. td->var_root[] is filled so that the highest
+// index refers to the root and each lower index refers to the first child of
+// the entry above it.
+void vp10_setup_var_tree(struct VP10Common *cm, ThreadData *td) {
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 1024;
+  const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
+  VAR_TREE *next_child;
+  int node_idx;
+  int level_nodes;
+  int level;
+
+  vpx_free(td->var_tree);
+  CHECK_MEM_ERROR(cm, td->var_tree,
+                  vpx_calloc(tree_nodes, sizeof(*td->var_tree)));
+
+  // The first leaf_nodes entries are the leaves; mark them as childless.
+  for (node_idx = 0; node_idx < leaf_nodes; ++node_idx)
+    td->var_tree[node_idx].split[0] = NULL;
+
+  // Walk level by level towards the root. Children are handed out in array
+  // order, so every parent receives four consecutive nodes of the level
+  // below it.
+  next_child = td->var_tree;
+  for (level_nodes = leaf_nodes / 4; level_nodes > 0; level_nodes /= 4) {
+    const int level_end = node_idx + level_nodes;
+    for (; node_idx < level_end; ++node_idx) {
+      VAR_TREE *const parent = &td->var_tree[node_idx];
+      int child;
+      for (child = 0; child < 4; ++child)
+        parent->split[child] = next_child++;
+    }
+  }
+
+  // Root for the largest superblock size is the last node of the array.
+  level = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->var_root[level] = &td->var_tree[tree_nodes - 1];
+  // Each smaller superblock size uses the first child of the next larger one.
+  for (--level; level >= 0; --level)
+    td->var_root[level] = td->var_root[level + 1]->split[0];
+}
+
+// Frees the node array allocated by vp10_setup_var_tree() and clears the
+// pointer so a later setup or free does not touch stale memory.
+// NOTE(review): td->var_root[] still points into the freed array afterwards.
+void vp10_free_var_tree(ThreadData *td) {
+ vpx_free(td->var_tree);
+ td->var_tree = NULL;
+}
diff --git a/vp10/encoder/variance_tree.h b/vp10/encoder/variance_tree.h
new file mode 100644
index 0000000..a10f7e7
--- /dev/null
+++ b/vp10/encoder/variance_tree.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_VARIANCE_TREE_H_
+#define VP10_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct ThreadData;
+
+// Accumulated statistics for one region; filled in by fill_variance().
+typedef struct {
+ int64_t sum_square_error;  // sum of squared errors (s2 in fill_variance)
+ int64_t sum_error;  // sum of errors (s in fill_variance)
+ int log2_count;  // log2 of the sample count; used as a shift amount
+ int variance;  // 256 * (sum_square_error - sum_error^2 / count) / count, computed with shifts
+} var;
+
+// Statistics of a node for the unsplit case and for its two-way splits, as
+// combined by fill_variance_node().
+typedef struct {
+ var none;  // whole node: vert[0] + vert[1]
+ var horz[2];  // horz[0] = children 0 + 1, horz[1] = children 2 + 3
+ var vert[2];  // vert[0] = children 0 + 2, vert[1] = children 1 + 3
+} partition_variance;
+
+// One node of the quad-tree built by vp10_setup_var_tree().
+// NOTE(review): apart from `split`, none of the fields are written in this
+// header or in variance_tree.c; the per-field notes below marked "presumably"
+// are assumptions to confirm against the code that populates the tree.
+typedef struct VAR_TREE {
+ int force_split;  // presumably non-zero when this block must be split - confirm
+ partition_variance variances;  // combined from the children by fill_variance_node()
+ struct VAR_TREE *split[4];  // child nodes; vp10_setup_var_tree() sets split[0] to NULL on leaves
+ BLOCK_SIZE bsize;
+ const uint8_t *src;  // presumably the source pixels covered by this node - confirm
+ const uint8_t *ref;  // presumably the reference pixels covered by this node - confirm
+ int src_stride;
+ int ref_stride;
+ int width;
+ int height;
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd;  // presumably set when src/ref hold high-bitdepth samples - confirm
+#endif // CONFIG_VP9_HIGHBITDEPTH
+} VAR_TREE;
+
+// Allocates and links the per-thread variance tree (td->var_tree, td->var_root).
+void vp10_setup_var_tree(struct VP10Common *cm, struct ThreadData *td);
+// Frees td->var_tree and resets the pointer to NULL.
+void vp10_free_var_tree(struct ThreadData *td);
+
+// Stores the sum of squared errors (s2), the sum of errors (s) and the log2
+// sample count (c) in *v and derives the variance from them:
+//   variance = 256 * (s2 - (s * s >> c)) >> c, truncated to int.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  // Squared sum divided by the sample count.
+  const int64_t mean_term = (s * s) >> c;
+
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+  v->variance = (int)((256 * (s2 - mean_term)) >> c);
+}
+
+// Merges the statistics of two regions with the same sample count into *r.
+// The merged region has twice the samples, hence log2_count + 1.
+static INLINE void sum_2_variances(const var *a, const var *b, var *r) {
+  int64_t merged_sq;
+  int64_t merged_sum;
+
+  assert(a->log2_count == b->log2_count);
+  merged_sq = a->sum_square_error + b->sum_square_error;
+  merged_sum = a->sum_error + b->sum_error;
+  fill_variance(merged_sq, merged_sum, a->log2_count + 1, r);
+}
+
+// Derives the statistics of a node from the `none` statistics of its four
+// children: the two horizontal halves (children 0+1 and 2+3), the two
+// vertical halves (children 0+2 and 1+3), and finally the whole node as the
+// sum of the two vertical halves.
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+  const var *const c0 = &vt->split[0]->variances.none;
+  const var *const c1 = &vt->split[1]->variances.none;
+  const var *const c2 = &vt->split[2]->variances.none;
+  const var *const c3 = &vt->split[3]->variances.none;
+  partition_variance *const pv = &vt->variances;
+
+  sum_2_variances(c0, c1, &pv->horz[0]);
+  sum_2_variances(c2, c3, &pv->horz[1]);
+  sum_2_variances(c0, c2, &pv->vert[0]);
+  sum_2_variances(c1, c3, &pv->vert[1]);
+  sum_2_variances(&pv->vert[0], &pv->vert[1], &pv->none);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_VARIANCE_TREE_H_ */
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 34b766f..d174c8b 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -21,6 +21,8 @@
VP10_CX_SRCS-yes += encoder/bitwriter.h
VP10_CX_SRCS-yes += encoder/context_tree.c
VP10_CX_SRCS-yes += encoder/context_tree.h
+VP10_CX_SRCS-yes += encoder/variance_tree.c
+VP10_CX_SRCS-yes += encoder/variance_tree.h
VP10_CX_SRCS-yes += encoder/cost.h
VP10_CX_SRCS-yes += encoder/cost.c
VP10_CX_SRCS-yes += encoder/dct.c
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 26fe785..d3695a9 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -12,22 +12,22 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
-unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- for (i = 0; i < 8; ++i, s+=p)
- for (j = 0; j < 8; sum += s[j], ++j) {}
+ for (i = 0; i < 8; ++i, src += stride)
+ for (j = 0; j < 8; sum += src[j], ++j) {}
- return (sum + 32) >> 6;
+ return ROUND_POWER_OF_TWO(sum, 6);
}
-unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- for (i = 0; i < 4; ++i, s+=p)
- for (j = 0; j < 4; sum += s[j], ++j) {}
+ for (i = 0; i < 4; ++i, src += stride)
+ for (j = 0; j < 4; sum += src[j], ++j) {}
- return (sum + 8) >> 4;
+ return ROUND_POWER_OF_TWO(sum, 4);
}
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -176,14 +176,15 @@
return var;
}
-void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
int *min, int *max) {
int i, j;
*min = 255;
*max = 0;
- for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
for (j = 0; j < 8; ++j) {
- int diff = abs(s[j]-d[j]);
+ int diff = abs(src[j]-ref[j]);
*min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max;
}
@@ -191,24 +192,24 @@
}
#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 8; ++i, s+=p)
+ const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 8; ++i, s += stride)
for (j = 0; j < 8; sum += s[j], ++j) {}
- return (sum + 32) >> 6;
+ return ROUND_POWER_OF_TWO(sum, 6);
}
-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 4; ++i, s+=p)
+ const uint16_t* s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 4; ++i, s+=stride)
for (j = 0; j < 4; sum += s[j], ++j) {}
- return (sum + 8) >> 4;
+ return ROUND_POWER_OF_TWO(sum, 4);
}
void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 1759854..dea2af9 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -23,10 +23,10 @@
#define FILTER_WEIGHT 128
typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b_ptr, int b_stride);
+ const uint8_t *b, int b_stride);
-typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
const uint8_t *second_pred);
typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
@@ -50,10 +50,10 @@
const uint8_t *b, int b_stride,
unsigned int *sse);
-typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a,
int a_stride,
int xoffset, int yoffset,
- const uint8_t *b_ptr,
+ const uint8_t *b,
int b_stride,
unsigned int *sse,
const uint8_t *second_pred);
@@ -75,26 +75,25 @@
#endif // CONFIG_VP8
#if CONFIG_VP10 && CONFIG_EXT_INTER
-typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
+typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
int ref_stride,
const uint8_t *msk_ptr,
int msk_stride);
-typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
+typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
int ref_stride,
- const uint8_t *msk_ptr,
+ const uint8_t *msk,
int msk_stride,
unsigned int *sse);
-typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src_ptr,
- int source_stride,
- int xoffset,
- int yoffset,
- const uint8_t *ref_ptr,
- int Refstride,
- const uint8_t *msk_ptr,
+typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src,
+ int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *ref,
+ int ref_stride,
+ const uint8_t *msk,
int msk_stride,
unsigned int *sse);
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a9805d7..46ef5fc 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -266,6 +266,11 @@
endif
endif
+# high bit depth subtract
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c
+endif
+
endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
ifeq ($(CONFIG_VP10_ENCODER),yes)
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 10a5280..a648e45 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -965,10 +965,6 @@
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
- specialize qw/vpx_highbd_subtract_block/;
-}
if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
#
@@ -991,6 +987,8 @@
specialize qw/vpx_highbd_avg_8x8/;
add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
specialize qw/vpx_highbd_avg_4x4/;
+ add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block sse2/;
}
#
diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000..33e464b
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+// Signature shared by all fixed-size subtract kernels in this file:
+// diff = src - pred on 16-bit samples. The block size is implied by the
+// kernel, and each buffer has its own stride in samples.
+typedef void (*SubtractWxHFuncType)(
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride);
+
+// Computes diff = src - pred for a block 4 samples wide and 4 rows tall.
+//
+// Each row is read with a 64-bit load, i.e. exactly the four 16-bit samples
+// that belong to the block. A full 128-bit load would read four samples past
+// the end of every row and could run off the end of the buffer on the last
+// row. All rows are loaded before any result is stored.
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i x[4];
+  int r;
+
+  for (r = 0; r < 4; ++r) {
+    const __m128i u =
+        _mm_loadl_epi64((const __m128i *)(src + r * src_stride));
+    const __m128i v =
+        _mm_loadl_epi64((const __m128i *)(pred + r * pred_stride));
+    x[r] = _mm_sub_epi16(u, v);
+  }
+
+  // Only the low 64 bits (four samples) of each result are written.
+  for (r = 0; r < 4; ++r)
+    _mm_storel_epi64((__m128i *)(diff + r * diff_stride), x[r]);
+}
+
+// Computes diff = src - pred for a block 4 samples wide and 8 rows tall.
+//
+// Each row is read with a 64-bit load, i.e. exactly the four 16-bit samples
+// that belong to the block. A full 128-bit load would read four samples past
+// the end of every row and could run off the end of the buffer on the last
+// row. All rows are loaded before any result is stored.
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i x[8];
+  int r;
+
+  for (r = 0; r < 8; ++r) {
+    const __m128i u =
+        _mm_loadl_epi64((const __m128i *)(src + r * src_stride));
+    const __m128i v =
+        _mm_loadl_epi64((const __m128i *)(pred + r * pred_stride));
+    x[r] = _mm_sub_epi16(u, v);
+  }
+
+  // Only the low 64 bits (four samples) of each result are written.
+  for (r = 0; r < 8; ++r)
+    _mm_storel_epi64((__m128i *)(diff + r * diff_stride), x[r]);
+}
+
+// Computes diff = src - pred for a block 8 samples wide and 4 rows tall.
+// One unaligned 128-bit load/store covers a full row of eight 16-bit samples.
+// All rows are loaded before any result is stored.
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i src_row[4];
+  __m128i pred_row[4];
+  int r;
+
+  for (r = 0; r < 4; ++r)
+    src_row[r] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+  for (r = 0; r < 4; ++r)
+    pred_row[r] = _mm_loadu_si128((__m128i const *)(pred + r * pred_stride));
+
+  for (r = 0; r < 4; ++r)
+    _mm_storeu_si128((__m128i *)(diff + r * diff_stride),
+                     _mm_sub_epi16(src_row[r], pred_row[r]));
+}
+
+// Computes diff = src - pred for a block 8 samples wide and 8 rows tall.
+// One unaligned 128-bit load/store covers a full row of eight 16-bit samples.
+// All rows are loaded before any result is stored.
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i src_row[8];
+  __m128i pred_row[8];
+  int r;
+
+  for (r = 0; r < 8; ++r)
+    src_row[r] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+  for (r = 0; r < 8; ++r)
+    pred_row[r] = _mm_loadu_si128((__m128i const *)(pred + r * pred_stride));
+
+  for (r = 0; r < 8; ++r)
+    _mm_storeu_si128((__m128i *)(diff + r * diff_stride),
+                     _mm_sub_epi16(src_row[r], pred_row[r]));
+}
+
+// 8 wide, 16 tall: two 8x8 blocks stacked vertically.
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 8;
+
+  // Top half.
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_8x8(diff + half_rows * diff_stride, diff_stride,
+               src + half_rows * src_stride, src_stride,
+               pred + half_rows * pred_stride, pred_stride);
+}
+
+// 16 wide, 8 tall: two 8x8 blocks side by side.
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_cols = 8;
+
+  // Left half.
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Right half starts half_cols samples to the right in every buffer.
+  subtract_8x8(diff + half_cols, diff_stride, src + half_cols, src_stride,
+               pred + half_cols, pred_stride);
+}
+
+// 16 wide, 16 tall: two 16x8 blocks stacked vertically.
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 8;
+
+  // Top half.
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_16x8(diff + half_rows * diff_stride, diff_stride,
+                src + half_rows * src_stride, src_stride,
+                pred + half_rows * pred_stride, pred_stride);
+}
+
+// 16 wide, 32 tall: two 16x16 blocks stacked vertically.
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 16;
+
+  // Top half.
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_16x16(diff + half_rows * diff_stride, diff_stride,
+                 src + half_rows * src_stride, src_stride,
+                 pred + half_rows * pred_stride, pred_stride);
+}
+
+// 32 wide, 16 tall: two 16x16 blocks side by side.
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_cols = 16;
+
+  // Left half.
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Right half starts half_cols samples to the right in every buffer.
+  subtract_16x16(diff + half_cols, diff_stride, src + half_cols, src_stride,
+                 pred + half_cols, pred_stride);
+}
+
+// 32 wide, 32 tall: two 32x16 blocks stacked vertically.
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 16;
+
+  // Top half.
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_32x16(diff + half_rows * diff_stride, diff_stride,
+                 src + half_rows * src_stride, src_stride,
+                 pred + half_rows * pred_stride, pred_stride);
+}
+
+// 32 wide, 64 tall: two 32x32 blocks stacked vertically.
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 32;
+
+  // Top half.
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_32x32(diff + half_rows * diff_stride, diff_stride,
+                 src + half_rows * src_stride, src_stride,
+                 pred + half_rows * pred_stride, pred_stride);
+}
+
+// 64 wide, 32 tall: two 32x32 blocks side by side.
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_cols = 32;
+
+  // Left half.
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Right half starts half_cols samples to the right in every buffer.
+  subtract_32x32(diff + half_cols, diff_stride, src + half_cols, src_stride,
+                 pred + half_cols, pred_stride);
+}
+
+// 64 wide, 64 tall: two 64x32 blocks stacked vertically.
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 32;
+
+  // Top half.
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_64x32(diff + half_rows * diff_stride, diff_stride,
+                 src + half_rows * src_stride, src_stride,
+                 pred + half_rows * pred_stride, pred_stride);
+}
+
+// 64 wide, 128 tall: two 64x64 blocks stacked vertically.
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 64;
+
+  // Top half.
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_64x64(diff + half_rows * diff_stride, diff_stride,
+                 src + half_rows * src_stride, src_stride,
+                 pred + half_rows * pred_stride, pred_stride);
+}
+
+// 128 wide, 64 tall: two 64x64 blocks side by side.
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_cols = 64;
+
+  // Left half.
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Right half starts half_cols samples to the right in every buffer.
+  subtract_64x64(diff + half_cols, diff_stride, src + half_cols, src_stride,
+                 pred + half_cols, pred_stride);
+}
+
+// 128 wide, 128 tall: two 128x64 blocks stacked vertically.
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+                             const uint16_t *src, ptrdiff_t src_stride,
+                             const uint16_t *pred, ptrdiff_t pred_stride) {
+  const int half_rows = 64;
+
+  // Top half.
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  // Bottom half starts half_rows rows further down in every buffer.
+  subtract_128x64(diff + half_rows * diff_stride, diff_stride,
+                  src + half_rows * src_stride, src_stride,
+                  pred + half_rows * pred_stride, pred_stride);
+}
+
+// Maps a block size to its fixed-size kernel. Kernels are named
+// subtract_<cols>x<rows>. For every supported row count the width may be
+// half, equal to, or double the height, within 4..128 samples. Any other
+// combination trips the assert; with asserts compiled out NULL is returned.
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  SubtractWxHFuncType chosen = NULL;
+
+  switch (rows) {
+    case 4:
+      switch (cols) {
+        case 4: chosen = subtract_4x4; break;
+        case 8: chosen = subtract_8x4; break;
+        default: break;
+      }
+      break;
+    case 8:
+      switch (cols) {
+        case 4: chosen = subtract_4x8; break;
+        case 8: chosen = subtract_8x8; break;
+        case 16: chosen = subtract_16x8; break;
+        default: break;
+      }
+      break;
+    case 16:
+      switch (cols) {
+        case 8: chosen = subtract_8x16; break;
+        case 16: chosen = subtract_16x16; break;
+        case 32: chosen = subtract_32x16; break;
+        default: break;
+      }
+      break;
+    case 32:
+      switch (cols) {
+        case 16: chosen = subtract_16x32; break;
+        case 32: chosen = subtract_32x32; break;
+        case 64: chosen = subtract_64x32; break;
+        default: break;
+      }
+      break;
+    case 64:
+      switch (cols) {
+        case 32: chosen = subtract_32x64; break;
+        case 64: chosen = subtract_64x64; break;
+        case 128: chosen = subtract_128x64; break;
+        default: break;
+      }
+      break;
+    case 128:
+      switch (cols) {
+        case 64: chosen = subtract_64x128; break;
+        case 128: chosen = subtract_128x128; break;
+        default: break;
+      }
+      break;
+    default:
+      break;
+  }
+
+  if (chosen == NULL) {
+    assert(0);
+  }
+  return chosen;
+}
+
+// SSE2 high-bitdepth residual: diff = src - pred over a rows x cols block.
+// src8 and pred8 are converted with CONVERT_TO_SHORTPTR before being handed
+// to the size-specific kernel picked by getSubtractFunc(). bd is not used.
+void vpx_highbd_subtract_block_sse2(
+    int rows, int cols,
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint8_t *src8, ptrdiff_t src_stride,
+    const uint8_t *pred8,
+    ptrdiff_t pred_stride,
+    int bd) {
+  const SubtractWxHFuncType subtract = getSubtractFunc(rows, cols);
+  (void)bd;
+
+  subtract(diff, diff_stride, CONVERT_TO_SHORTPTR(src8), src_stride,
+           CONVERT_TO_SHORTPTR(pred8), pred_stride);
+}