Merge "Eliminate num_8x8 and num_4x4 width/height lookups"
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 6294af1..4ee4ad4 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -494,6 +494,14 @@
make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
#endif
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
+ !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4WHT,  // C forward transform paired with the SSE2 inverse.
+ ::testing::Values(
+ make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+#endif
+
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 524e79f..538f1ed 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -475,7 +475,7 @@
specialize qw/vp9_iwht4x4_1_add msa/;
add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_iwht4x4_16_add msa/;
+ specialize qw/vp9_iwht4x4_16_add msa/, "$sse2_x86inc";
}
}
diff --git a/vp9/common/x86/vp9_idct_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
new file mode 100644
index 0000000..9619e37
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0 ; in/out: m0-m3 (word lanes); scratch: m4, m5
+ paddw m0, m2 ; m0 += m2
+ psubw m3, m1 ; m3 -= m1
+ psubw m4, m0, m3 ; m4 = m0 - m3 (x86inc three-operand form)
+ psraw m4, 1 ; m4 >>= 1, arithmetic (sign-preserving) shift
+ psubw m5, m4, m1 ; b1 = m4 - m1
+ psubw m4, m2 ; c1 = m4 - m2
+ psubw m0, m5 ; m0 -= b1
+ paddw m3, m4 ; m3 += c1
+ ; register contents after the SWAPs below: m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0 ; transposes 4x4 words in the low halves of m0-m3; uses m4, m5
+ SWAP 4, 0 ; m4 = row 0
+ SWAP 5, 2 ; m5 = row 2
+ punpcklwd m4, m1 ; interleave the words of rows 0 and 1
+ pshufd m0, m4, 0x0e ; m0 low half = m4 high half
+ punpcklwd m5, m3 ; interleave the words of rows 2 and 3
+ pshufd m2, m5, 0x0e ; m2 low half = m5 high half
+ SWAP 1, 4
+ SWAP 3, 0
+ punpckldq m1, m5 ; m1 = column 0 (low half) and column 1 (high half)
+ pshufd m4, m1, 0x0e ; m4 low half = column 1
+ punpckldq m3, m2 ; m3 = column 2 (low half) and column 3 (high half)
+ pshufd m0, m3, 0x0e ; m0 low half = column 3
+ SWAP 2, 3, 0, 1, 4 ; rename so that m0-m3 hold columns 0-3
+%endmacro
+
+; Transposes a 4x4 int16 matrix held in m0 and m1 into the low halves of m0-m3.
+%macro TRANSPOSE_4X4_WIDE 0 ; in: m0 = rows 0-1, m1 = rows 2-3
+ mova m3, m0 ; keep a copy of rows 0-1
+ punpcklwd m0, m1 ; interleave the words of rows 0 and 2
+ punpckhwd m3, m1 ; interleave the words of rows 1 and 3
+ mova m2, m0
+ punpcklwd m0, m3 ; m0 = column 0 (low half) and column 1 (high half)
+ punpckhwd m2, m3 ; m2 = column 2 (low half) and column 3 (high half)
+ pshufd m1, m0, 0x0e ; m1 low half = column 1
+ pshufd m3, m2, 0x0e ; m3 low half = column 3
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movq m%3, [outputq] ; NOTE(review): loads 8 bytes although only 4 are stored back
+ movq m%4, [outputq + strideq] ; same for the row one stride below
+ punpcklbw m%3, m%5 ; widen bytes to words by interleaving with the zero register
+ punpcklbw m%4, m%5
+ paddw m%1, m%3 ; src1 += widened bytes of the first row
+ paddw m%2, m%4 ; src2 += widened bytes of the second row
+ packuswb m%1, m%5 ; pack words back to bytes with unsigned saturation
+ packuswb m%2, m%5
+ movd [outputq], m%1 ; store the low 4 bytes only
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 4, 7, input, output, stride ; 3 args, 4 GPRs (3 referenced here), 7 XMM regs
+ mova m0, [inputq + 0] ;a1 (aligned load of input bytes 0-15)
+ mova m1, [inputq + 16] ;c1 (aligned load of input bytes 16-31)
+ ; NOTE(review): the C prototype declares the stride as int; confirm strideq's upper bits are valid.
+ psraw m0, 2 ; arithmetic right shift of every word by 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE ; low halves of m0-m3 = columns 0-3 of the loaded matrix
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ TRANSFORM_COLS ; NOTE(review): leaves m0-m3 as a, c, d, b - confirm the store order below
+
+ pxor m4, m4 ; m4 = 0, the "zero" operand of ADD_STORE_4P_2X
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4 ; add m0 / m1 into the first two output rows
+ lea outputq, [outputq + 2 * strideq] ; advance the output pointer by two rows
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4 ; add m2 / m3 into the next two output rows
+
+ RET
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 02d986e..4f44b43 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -194,7 +194,6 @@
set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
-
mbmi = &xd->mi[0]->mbmi;
// Set up destination pointers.
@@ -2123,38 +2122,6 @@
BLOCK_64X64
};
-// Checks to see if a macro block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(VP9_COMP *cpi,
- int mi_row, int mi_col) {
- int is_active_edge = 0;
- int top_edge = 0;
- int bottom_edge = cpi->common.mi_rows;
- int left_edge = 0;
- int right_edge = cpi->common.mi_cols;
-
- // For two pass account for any formatting bars detected.
- if (cpi->oxcf.pass == 2) {
- TWO_PASS *twopass = &cpi->twopass;
-
- // The inactive region is specified in MBs not mi units.
- // The image edge is in the following MB row.
- top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
-
- bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
- bottom_edge = MAX(top_edge, bottom_edge);
- }
-
- if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
- ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
- ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
- ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
- is_active_edge = 1;
- }
-
- return is_active_edge;
-}
// Look at all the mode_info entries for blocks that are part of this
// partition and find the min and max values for sb_type.
@@ -2253,7 +2220,7 @@
// Test for blocks at the edge of the active image.
// This may be the actual edge of the image or where there are formatting
// bars.
- if (active_edge_sb(cpi, mi_row, mi_col)) {
+ if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
min_size = BLOCK_4X4;
} else {
min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3130941..659ce72 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -763,7 +763,7 @@
}
void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
+ TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a85b70b..61279f8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -126,6 +126,7 @@
stats->pcnt_neutral,
stats->intra_skip_pct,
stats->inactive_zone_rows,
+ stats->inactive_zone_cols,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -164,6 +165,7 @@
section->pcnt_neutral = 0.0;
section->intra_skip_pct = 0.0;
section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -191,6 +193,7 @@
section->pcnt_neutral += frame->pcnt_neutral;
section->intra_skip_pct += frame->intra_skip_pct;
section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -216,6 +219,7 @@
section->pcnt_neutral -= frame->pcnt_neutral;
section->intra_skip_pct -= frame->intra_skip_pct;
section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
section->MVr -= frame->MVr;
section->mvr_abs -= frame->mvr_abs;
section->MVc -= frame->MVc;
@@ -1050,6 +1054,7 @@
fps.pcnt_neutral = (double)neutral_count / num_mbs;
fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
fps.inactive_zone_rows = (double)image_data_start_row;
+ fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix
if (mvcount > 0) {
fps.MVr = (double)sum_mvr / mvcount;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 0047932..49f9da3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -53,6 +53,7 @@
double pcnt_neutral;
double intra_skip_pct;
double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
double MVr;
double mvr_abs;
double MVc;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a27e89..409300b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2898,6 +2898,47 @@
*this_rd += (*this_rd * var_factor) / 100;
}
+
+// Returns 1 if pass-2 stats show inactive rows/cols (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+ return (cpi->oxcf.pass == 2) &&  // Stats are only consulted in pass 2.
+ ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||  // top/bottom
+ (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));  // left/right
+}
+
+// Checks to see if the MI_BLOCK_SIZE-square block at (mi_row, mi_col) is at
+// the edge of the active image. In most cases this is the "real" edge unless
+// there are formatting bars embedded in the stream.
+//   cpi:    encoder state; supplies the frame size in mi units and, in
+//           pass 2, the inactive zone sizes from the two-pass frame stats.
+//   mi_row: first mi row of the block to test.
+//   mi_col: first mi column of the block to test.
+// Returns 1 if any active-image edge lies inside the block, otherwise 0.
+int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+    // The inactive regions are specified in MBs not mi units (hence the * 2).
+    // The image edge is in the following MB row / column.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+    left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+    right_edge = MAX(left_edge, right_edge);
+  }
+
+  return ((top_edge >= mi_row) && (top_edge < mi_row + MI_BLOCK_SIZE)) ||
+         ((bottom_edge >= mi_row) && (bottom_edge < mi_row + MI_BLOCK_SIZE)) ||
+         ((left_edge >= mi_col) && (left_edge < mi_col + MI_BLOCK_SIZE)) ||
+         ((right_edge >= mi_col) && (right_edge < mi_col + MI_BLOCK_SIZE));
+}
+
void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
TileDataEnc *tile_data,
MACROBLOCK *x,
@@ -3751,13 +3792,15 @@
int skip_uv;
PREDICTION_MODE mode_uv = DC_PRED;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
int64_t mask_filter = 0;
int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+ int internal_active_edge =
+ vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
memset(x->zcoeff_blk[TX_4X4], 0, 4);
@@ -3843,7 +3886,8 @@
continue;
// Test best rd so far against threshold for trying this mode.
- if (rd_less_than_thresh(best_rd,
+ if (!internal_active_edge &&
+ rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
tile_data->thresh_freq_fact[bsize][ref_index]))
continue;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 459b032..16a8c68 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -54,6 +54,9 @@
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x,
@@ -61,6 +64,7 @@
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6c6c4ed..b3e5a0e 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -12,6 +12,8 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
@@ -90,7 +92,8 @@
// If this is a two pass clip that fits the criteria for animated or
// graphics content then reset disable_split_mask for speeds 1-4.
if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
- (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)) {
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (vp9_internal_image_edge(cpi)))) {
sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2e43c27..bcab558 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,6 +75,10 @@
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
+# vp9_idct_sse2.asm includes x86inc.asm, so only build it when x86inc is enabled.
+ifeq ($(CONFIG_USE_X86INC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
+endif
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm