Merge "Refactoring and cosmetic changes to ext-inter expt" into nextgenv2
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 0092e8c..78aabe6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -74,6 +74,10 @@
return res;
}
+/* Note:
+ * Our codebase calculates the "diff" value in the variance algorithm by
+ * (src - ref).
+ */
static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref,
int l2w, int l2h, int src_stride_coeff,
int ref_stride_coeff, uint32_t *sse_ptr,
@@ -87,14 +91,14 @@
for (int x = 0; x < w; x++) {
int diff;
if (!use_high_bit_depth_) {
- diff = ref[w * y * ref_stride_coeff + x] -
- src[w * y * src_stride_coeff + x];
+ diff = src[w * y * src_stride_coeff + x] -
+ ref[w * y * ref_stride_coeff + x];
se += diff;
sse += diff * diff;
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
- CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+ diff = CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x] -
+ CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x];
se += diff;
sse += diff * diff;
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -328,8 +332,10 @@
log2height_, stride_coeff,
stride_coeff, &sse2,
use_high_bit_depth_, bit_depth_);
- EXPECT_EQ(sse1, sse2);
- EXPECT_EQ(var1, var2);
+ EXPECT_EQ(sse1, sse2)
+ << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2)
+ << "Error at test index: " << i;
}
}
@@ -361,8 +367,10 @@
log2height_, src_stride_coeff,
ref_stride_coeff, &sse2,
use_high_bit_depth_, bit_depth_);
- EXPECT_EQ(sse1, sse2);
- EXPECT_EQ(var1, var2);
+ EXPECT_EQ(sse1, sse2)
+ << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2)
+ << "Error at test index: " << i;
}
}
@@ -866,36 +874,36 @@
make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
- // make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
- // make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
- // make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+ make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+ make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+ make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
- // make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
- // make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
- // make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
- // make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
- // make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
- // make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+ make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+ make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+ make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+ make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+ make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+ make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
- // make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
- // make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+ make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+ make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
#endif // CONFIG_VP10 && CONFIG_EXT_PARTITION
- // make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
- // make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
- // make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
- // make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
- // make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
- // make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
- // make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
- // make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
- // make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
- // make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
- // make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
- // make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
- // make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+ make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+ make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+ make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+ make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+ make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+ make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+ make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+ make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+ make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+ make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+ make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+ make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+ make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
#if CONFIG_VP10 && CONFIG_EXT_PARTITION
make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
@@ -1138,25 +1146,25 @@
INSTANTIATE_TEST_CASE_P(
SSE2, VpxHBDVarianceTest,
::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_sse2, 12),
- // make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
- // make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
- // make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
+ make_tuple(6, 5, &vpx_highbd_12_variance64x32_sse2, 12),
+ make_tuple(5, 6, &vpx_highbd_12_variance32x64_sse2, 12),
+ make_tuple(5, 5, &vpx_highbd_12_variance32x32_sse2, 12),
make_tuple(5, 4, &vpx_highbd_12_variance32x16_sse2, 12),
make_tuple(4, 5, &vpx_highbd_12_variance16x32_sse2, 12),
- // make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
- // make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
- // make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
- // make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
- // make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
- // make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
- // make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
- // make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
- // make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
- // make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
- // make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
- // make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
- // make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
- // make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
+ make_tuple(4, 4, &vpx_highbd_12_variance16x16_sse2, 12),
+ make_tuple(4, 3, &vpx_highbd_12_variance16x8_sse2, 12),
+ make_tuple(3, 4, &vpx_highbd_12_variance8x16_sse2, 12),
+ make_tuple(3, 3, &vpx_highbd_12_variance8x8_sse2, 12),
+ make_tuple(6, 6, &vpx_highbd_10_variance64x64_sse2, 10),
+ make_tuple(6, 5, &vpx_highbd_10_variance64x32_sse2, 10),
+ make_tuple(5, 6, &vpx_highbd_10_variance32x64_sse2, 10),
+ make_tuple(5, 5, &vpx_highbd_10_variance32x32_sse2, 10),
+ make_tuple(5, 4, &vpx_highbd_10_variance32x16_sse2, 10),
+ make_tuple(4, 5, &vpx_highbd_10_variance16x32_sse2, 10),
+ make_tuple(4, 4, &vpx_highbd_10_variance16x16_sse2, 10),
+ make_tuple(4, 3, &vpx_highbd_10_variance16x8_sse2, 10),
+ make_tuple(3, 4, &vpx_highbd_10_variance8x16_sse2, 10),
+ make_tuple(3, 3, &vpx_highbd_10_variance8x8_sse2, 10),
make_tuple(6, 6, &vpx_highbd_8_variance64x64_sse2, 8),
make_tuple(6, 5, &vpx_highbd_8_variance64x32_sse2, 8),
make_tuple(5, 6, &vpx_highbd_8_variance32x64_sse2, 8),
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index ae11556..5cecf79 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -363,40 +363,12 @@
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
- return subsize_lookup[partition][bsize];
+ if (partition == PARTITION_INVALID)
+ return PARTITION_INVALID;
+ else
+ return subsize_lookup[partition][bsize];
}
-#if CONFIG_EXT_PARTITION_TYPES
-static INLINE PARTITION_TYPE get_partition(const MODE_INFO *const mi,
- int mi_stride, int mi_rows,
- int mi_cols, int mi_row,
- int mi_col, BLOCK_SIZE bsize) {
- const int bsl = b_width_log2_lookup[bsize];
- const int bs = (1 << bsl) / 4;
- MODE_INFO m = mi[mi_row * mi_stride + mi_col];
- PARTITION_TYPE partition = partition_lookup[bsl][m.mbmi.sb_type];
- if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
- mi_row + bs < mi_rows && mi_col + bs < mi_cols) {
- BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
- BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
- MODE_INFO m_right = mi[mi_row * mi_stride + mi_col + bs];
- MODE_INFO m_below = mi[(mi_row + bs) * mi_stride + mi_col];
- if (m.mbmi.sb_type == h) {
- return m_below.mbmi.sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
- } else if (m.mbmi.sb_type == v) {
- return m_right.mbmi.sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
- } else if (m_below.mbmi.sb_type == h) {
- return PARTITION_HORZ_A;
- } else if (m_right.mbmi.sb_type == v) {
- return PARTITION_VERT_A;
- } else {
- return PARTITION_SPLIT;
- }
- }
- return partition;
-}
-#endif // CONFIG_EXT_PARTITION_TYPES
-
static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
DCT_DCT, // DC
ADST_DCT, // V
diff --git a/vp10/common/mfqe.c b/vp10/common/mfqe.c
index bd0b25b..52756bd 100644
--- a/vp10/common/mfqe.c
+++ b/vp10/common/mfqe.c
@@ -355,6 +355,12 @@
const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
// Last decoded frame and will store the MFQE result.
YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  // TODO(any): Fix for ext partition types and 128 superblocks
+ assert(0);
+#endif // CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
// Loop through each super block.
for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index cc443e7..d122495 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -372,7 +372,8 @@
return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
}
-static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP10_COMMON *cm) {
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+ const VP10_COMMON *const cm) {
return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
}
@@ -651,6 +652,51 @@
}
#endif
+static INLINE PARTITION_TYPE get_partition(const VP10_COMMON *const cm,
+ const int mi_row,
+ const int mi_col,
+ const BLOCK_SIZE bsize) {
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+ return PARTITION_INVALID;
+ } else {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + offset;
+ const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+ const int bsl = b_width_log2_lookup[bsize];
+ const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+ return partition;
+#else
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+
+ assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
+ if (partition != PARTITION_NONE &&
+ bsize > BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows &&
+ mi_col + hbs < cm->mi_cols) {
+ const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+ const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+ const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+ const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+ if (mbmi->sb_type == h) {
+ return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+ } else if (mbmi->sb_type == v) {
+ return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+ } else if (mbmi_below->sb_type == h) {
+ return PARTITION_HORZ_A;
+ } else if (mbmi_right->sb_type == v) {
+ return PARTITION_VERT_A;
+ } else {
+ return PARTITION_SPLIT;
+ }
+ }
+
+ return partition;
+#endif // !CONFIG_EXT_PARTITION_TYPES
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 43ee719..8cc6b84 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1026,7 +1026,11 @@
set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
cm->mi_rows, cm->mi_cols);
+#if CONFIG_EXT_TILE
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+#else
xd->up_available = (mi_row_ori != 0);
+#endif // CONFIG_EXT_TILE
xd->left_available = (mi_col_ori > tile->mi_col_start);
set_plane_n4(xd, bw, bh, bwl, bhl);
@@ -1292,17 +1296,15 @@
int mi_row_top, int mi_col_top,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
uint8_t *dst_buf[3], int dst_stride[3]) {
- VP10_COMMON *const cm = &pbi->common;
- const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize;
-#if !CONFIG_EXT_PARTITION_TYPES
- MB_MODE_INFO *mbmi;
-#endif
- int i, offset = mi_row * cm->mi_stride + mi_col;
+ const VP10_COMMON *const cm = &pbi->common;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
#if CONFIG_EXT_PARTITION_TYPES
- BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
#endif
+ int i;
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
DECLARE_ALIGNED(16, uint8_t,
@@ -1345,16 +1347,8 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- xd->mi = cm->mi_grid_visible + offset;
- xd->mi[0] = cm->mi + offset;
-#if CONFIG_EXT_PARTITION_TYPES
- partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col, bsize);
-#else
- mbmi = &xd->mi[0]->mbmi;
- partition = partition_lookup[bsl][mbmi->sb_type];
-#endif
- subsize = get_subsize(bsize, partition);
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ xd->mi[0] = cm->mi + mi_offset;
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index e46a0fc..f402acb 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1655,13 +1655,12 @@
int mi_row, int mi_col, BLOCK_SIZE bsize) {
const VP10_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
- const int bsl = b_width_log2_lookup[bsize];
- const int bs = (1 << bsl) / 4;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize;
- MODE_INFO *m = NULL;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
#if CONFIG_SUPERTX
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO *mbmi = NULL;
const int pack_token = !supertx_enabled;
TX_SIZE supertx_size;
int plane;
@@ -1670,17 +1669,10 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
-
- partition = partition_lookup[bsl][m->mbmi.sb_type];
-#if CONFIG_EXT_PARTITION_TYPES
- partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col, bsize);
-#endif
- write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
- subsize = get_subsize(bsize, partition);
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
#if CONFIG_SUPERTX
- xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+ xd->mi = cm->mi_grid_visible + mi_offset;
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[bsize],
mi_col, num_8x8_blocks_wide_lookup[bsize],
@@ -1731,59 +1723,59 @@
case PARTITION_HORZ:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
- if (mi_row + bs < cm->mi_rows)
+ if (mi_row + hbs < cm->mi_rows)
write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
- supertx_enabled, mi_row + bs, mi_col);
+ supertx_enabled, mi_row + hbs, mi_col);
break;
case PARTITION_VERT:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
- if (mi_col + bs < cm->mi_cols)
+ if (mi_col + hbs < cm->mi_cols)
write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
- supertx_enabled, mi_row, mi_col + bs);
+ supertx_enabled, mi_row, mi_col + hbs);
break;
case PARTITION_SPLIT:
write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col, subsize);
write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row, mi_col + bs, subsize);
+ mi_row, mi_col + hbs, subsize);
write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col, subsize);
+ mi_row + hbs, mi_col, subsize);
write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col + bs, subsize);
+ mi_row + hbs, mi_col + hbs, subsize);
break;
#if CONFIG_EXT_PARTITION_TYPES
case PARTITION_HORZ_A:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row, mi_col + bs);
+ mi_row, mi_col + hbs);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col);
+ mi_row + hbs, mi_col);
break;
case PARTITION_HORZ_B:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col);
+ mi_row + hbs, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col + bs);
+ mi_row + hbs, mi_col + hbs);
break;
case PARTITION_VERT_A:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col);
+ mi_row + hbs, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row, mi_col + bs);
+ mi_row, mi_col + hbs);
break;
case PARTITION_VERT_B:
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
mi_row, mi_col);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row, mi_col + bs);
+ mi_row, mi_col + hbs);
write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
- mi_row + bs, mi_col + bs);
+ mi_row + hbs, mi_col + hbs);
break;
#endif // CONFIG_EXT_PARTITION_TYPES
default:
@@ -1792,15 +1784,15 @@
}
#if CONFIG_SUPERTX
if (partition != PARTITION_NONE && supertx_enabled && pack_token &&
- !m->mbmi.skip) {
+ !mbmi->skip) {
assert(*tok < tok_end);
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const int mbmi_txb_size = txsize_to_bsize[m->mbmi.tx_size];
+ const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi_txb_size];
const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi_txb_size];
int row, col;
- TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
- : m->mbmi.tx_size;
+ TX_SIZE tx = plane ? get_uv_tx_size(mbmi, &xd->plane[plane])
+ : mbmi->tx_size;
BLOCK_SIZE txb_size = txsize_to_bsize[tx];
int bw = num_4x4_blocks_wide_lookup[txb_size];
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 3da16cc..d1230d3 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -373,7 +373,11 @@
assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
cm->mi_rows, cm->mi_cols);
+#if CONFIG_EXT_TILE
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+#else
xd->up_available = (mi_row_ori != 0);
+#endif // CONFIG_EXT_TILE
xd->left_available = (mi_col_ori > tile->mi_col_start);
// R/D setup.
@@ -2264,35 +2268,24 @@
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
- VP10_COMMON *const cm = &cpi->common;
+ const VP10_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
- int ctx;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize = bsize;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
#if CONFIG_EXT_PARTITION_TYPES
- BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
#endif
+ assert(bsize >= BLOCK_8X8);
+
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (bsize >= BLOCK_8X8) {
- ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- subsize = get_subsize(bsize, pc_tree->partitioning);
- } else {
- ctx = 0;
- subsize = BLOCK_4X4;
- }
-
- partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
- if (bsize > BLOCK_8X8)
- partition = pc_tree->partitioning;
-#endif
- if (output_enabled && bsize != BLOCK_4X4)
+ if (output_enabled)
td->counts->partition[ctx][partition]++;
#if CONFIG_SUPERTX
@@ -2583,12 +2576,11 @@
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int mis = cm->mi_stride;
- const int bsl = b_width_log2_lookup[bsize];
- const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
- const int bss = (1 << bsl) / 4;
+ const int bs = num_8x8_blocks_wide_lookup[bsize];
+ const int hbs = bs / 2;
int i, pl;
- PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE subsize;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
RD_COST last_part_rdc, none_rdc, chosen_rdc;
BLOCK_SIZE sub_subsize = BLOCK_4X4;
@@ -2616,9 +2608,6 @@
vp10_rd_cost_reset(&none_rdc);
vp10_rd_cost_reset(&chosen_rdc);
- partition = partition_lookup[bsl][bs_type];
- subsize = get_subsize(bsize, partition);
-
pc_tree->partitioning = partition;
#if CONFIG_VAR_TX
@@ -2643,7 +2632,7 @@
splits_below = 1;
for (i = 0; i < 4; i++) {
int jj = i >> 1, ii = i & 0x01;
- MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+ MODE_INFO *this_mi = mi_8x8[jj * hbs * mis + ii * hbs];
if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
splits_below = 0;
}
@@ -2653,8 +2642,8 @@
// If partition is not none try none unless each of the 4 splits are split
// even further..
if (partition != PARTITION_NONE && !splits_below &&
- mi_row + (mi_step >> 1) < cm->mi_rows &&
- mi_col + (mi_step >> 1) < cm->mi_cols) {
+ mi_row + hbs < cm->mi_rows &&
+ mi_col + hbs < cm->mi_cols) {
pc_tree->partitioning = PARTITION_NONE;
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
#if CONFIG_SUPERTX
@@ -2705,7 +2694,7 @@
subsize, &pc_tree->horizontal[0],
INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
- bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+ bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) {
RD_COST tmp_rdc;
#if CONFIG_SUPERTX
int rt_nocoef = 0;
@@ -2715,7 +2704,7 @@
update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
rd_pick_sb_modes(cpi, tile_data, x,
- mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+ mi_row + hbs, mi_col, &tmp_rdc,
#if CONFIG_SUPERTX
&rt_nocoef,
#endif
@@ -2748,7 +2737,7 @@
#endif
subsize, &pc_tree->vertical[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
- bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+ bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) {
RD_COST tmp_rdc;
#if CONFIG_SUPERTX
int rt_nocoef = 0;
@@ -2758,7 +2747,7 @@
update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
rd_pick_sb_modes(cpi, tile_data, x,
- mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+ mi_row, mi_col + hbs, &tmp_rdc,
#if CONFIG_SUPERTX
&rt_nocoef,
#endif
@@ -2801,8 +2790,8 @@
last_part_rate_nocoef = 0;
#endif
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (mi_step >> 1);
- int y_idx = (i >> 1) * (mi_step >> 1);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
int jj = i >> 1, ii = i & 0x01;
RD_COST tmp_rdc;
#if CONFIG_SUPERTX
@@ -2813,7 +2802,7 @@
vp10_rd_cost_init(&tmp_rdc);
rd_use_partition(cpi, td, tile_data,
- mi_8x8 + jj * bss * mis + ii * bss, tp,
+ mi_8x8 + jj * hbs * mis + ii * hbs, tp,
mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist,
#if CONFIG_SUPERTX
@@ -2853,10 +2842,10 @@
&& cpi->sf.adjust_partitioning_from_last_frame
&& cpi->sf.partition_search_type == SEARCH_PARTITION
&& partition != PARTITION_SPLIT && bsize > BLOCK_8X8
- && (mi_row + mi_step < cm->mi_rows ||
- mi_row + (mi_step >> 1) == cm->mi_rows)
- && (mi_col + mi_step < cm->mi_cols ||
- mi_col + (mi_step >> 1) == cm->mi_cols)) {
+ && (mi_row + bs < cm->mi_rows ||
+ mi_row + hbs == cm->mi_rows)
+ && (mi_col + bs < cm->mi_cols ||
+ mi_col + hbs == cm->mi_cols)) {
BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
chosen_rdc.rate = 0;
chosen_rdc.dist = 0;
@@ -2870,8 +2859,8 @@
// Split partition.
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (mi_step >> 1);
- int y_idx = (i >> 1) * (mi_step >> 1);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
RD_COST tmp_rdc;
#if CONFIG_SUPERTX
int rt_nocoef = 0;
@@ -5164,29 +5153,20 @@
static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
- VP10_COMMON *const cm = &cpi->common;
+ const VP10_COMMON *const cm = &cpi->common;
- const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize = bsize;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
#if CONFIG_EXT_PARTITION_TYPES
int i;
#endif
+ assert(bsize >= BLOCK_8X8);
+
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return 1;
- if (bsize >= BLOCK_8X8)
- subsize = get_subsize(bsize, pc_tree->partitioning);
- else
- subsize = BLOCK_4X4;
-
- partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
- if (bsize > BLOCK_8X8)
- partition = pc_tree->partitioning;
-#endif
-
switch (partition) {
case PARTITION_NONE:
return check_intra_b(&pc_tree->none);
@@ -5522,14 +5502,15 @@
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
#if CONFIG_EXT_PARTITION_TYPES
- BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
#endif
- int i, ctx;
+ int i;
uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
@@ -5537,6 +5518,12 @@
int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ return;
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int len = sizeof(uint16_t);
@@ -5564,23 +5551,8 @@
}
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
- return;
-
- if (bsize >= BLOCK_8X8) {
- ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- subsize = get_subsize(bsize, pc_tree->partitioning);
- } else {
- ctx = 0;
- subsize = BLOCK_4X4;
- }
- partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION_TYPES
- if (bsize > BLOCK_8X8)
- partition = pc_tree->partitioning;
-#endif
- if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
- cm->counts.partition[ctx][partition]++;
+ if (output_enabled && bsize < top_bsize)
+ cm->counts.partition[ctx][partition]++;
for (i = 0; i < MAX_MB_PLANE; i++) {
xd->plane[i].dst.buf = dst_buf[i];
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 4327d97..0c8ec43 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -367,8 +367,8 @@
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
- vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
- y_stride);
+ vpx_highbd_comp_avg_pred_c(comp_pred16, second_pred, w, h, y + offset,
+ y_stride);
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
sse1);
} else {
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index e7f746f..8628b99 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -180,8 +180,7 @@
if (bsize == BLOCK_8X8)
partition = PARTITION_NONE;
else
- partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col, bsize);
+ partition = get_partition(cm, mi_row, mi_col, bsize);
switch (partition) {
case PARTITION_NONE:
count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 24f42df..90c8bed 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -433,7 +433,7 @@
return *sse; \
}
-static void highbd_var_filter_block2d_bil_first_pass(
+void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
@@ -459,7 +459,7 @@
}
}
-static void highbd_var_filter_block2d_bil_second_pass(
+void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
@@ -551,8 +551,8 @@
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
@@ -573,8 +573,8 @@
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
@@ -595,8 +595,8 @@
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
@@ -635,9 +635,9 @@
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 161d647..4ad23f8 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -130,6 +130,24 @@
} vp10_variance_fn_ptr_t;
#endif // CONFIG_VP10
+void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter);
+
+void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index e371849..a9805d7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -350,6 +350,7 @@
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
ifeq ($(CONFIG_USE_X86INC),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d01e81d..10a5280 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1316,10 +1316,17 @@
if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
}
+ if ($w == 4 && $h == 4) {
+ specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1";
+ }
if ($w != 128 && $h != 128 && $w != 4) {
specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
}
+ if ($w == 4 && $h == 4) {
+ specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+ specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+ }
}
}
} # CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000..18ecc7e
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride));
+ a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride));
+ a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride));
+ a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride));
+
+ b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride));
+ b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride));
+ b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride));
+ b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ return *sse - ((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ return *sse - ((sum * sum) >> 4);
+}
+
+// Sub-pixel
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}