Merge changes I2965e786,I144bedde

* changes:
  vpx_memset16: drop unnecessary local
  vpx_memset16: quiet signed/unsigned warning
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index 0d14ad8..4404701 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl
@@ -87,65 +87,127 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp10_iht4x4_16_add/; + if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { + add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x4_16_add/; - add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp10_iht8x8_64_add/; + add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x8_64_add/; - add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp10_iht16x16_256_add/; + add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp10_iht16x16_256_add/; - add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct4x4 sse2/; + add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4/; - add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct4x4_1 sse2/; + add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4_1/; - add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct8x8 sse2/; + add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8/; - add_proto qw/void vp10_fdct8x8_1/, 
"const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct8x8_1 sse2/; + add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8_1/; - add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct16x16 sse2/; + add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16/; - add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct16x16_1 sse2/; + add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16_1/; - add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32 sse2/; + add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32/; - add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32_rd sse2/; + add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_rd/; - add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_fdct32x32_1 sse2/; + add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_1/; - add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct4x4 sse2/; + add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct4x4/; - add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct8x8 sse2/; + add_proto qw/void vp10_highbd_fdct8x8/, "const 
int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8/; - add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct8x8_1/; + add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8_1/; - add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct16x16 sse2/; + add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16/; - add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct16x16_1/; + add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16_1/; - add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32 sse2/; + add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32/; - add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32_rd sse2/; + add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_rd/; - add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp10_highbd_fdct32x32_1/; + add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_1/; + } else { + add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht4x4_16_add sse2/; + + add_proto qw/void 
vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + specialize qw/vp10_iht8x8_64_add sse2/; + + add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + specialize qw/vp10_iht16x16_256_add/; + + add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4 sse2/; + + add_proto qw/void vp10_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct4x4_1 sse2/; + + add_proto qw/void vp10_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8 sse2/; + + add_proto qw/void vp10_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct8x8_1 sse2/; + + add_proto qw/void vp10_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16 sse2/; + + add_proto qw/void vp10_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct16x16_1 sse2/; + + add_proto qw/void vp10_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32 sse2/; + + add_proto qw/void vp10_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_rd sse2/; + + add_proto qw/void vp10_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_fdct32x32_1 sse2/; + + add_proto qw/void vp10_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct4x4 sse2/; + + add_proto qw/void vp10_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8 sse2/; + + add_proto qw/void vp10_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct8x8_1/; + + add_proto qw/void vp10_highbd_fdct16x16/, "const int16_t *input, tran_low_t 
*output, int stride"; + specialize qw/vp10_highbd_fdct16x16 sse2/; + + add_proto qw/void vp10_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct16x16_1/; + + add_proto qw/void vp10_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32 sse2/; + + add_proto qw/void vp10_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_rd sse2/; + + add_proto qw/void vp10_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vp10_highbd_fdct32x32_1/; + } } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c index d58e263..a2c674b 100644 --- a/vp10/common/x86/idct_intrin_sse2.c +++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -12,14 +12,14 @@ #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" -void vp10_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadu_si128((const __m128i *)(input)); - in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -77,21 +77,21 @@ } } -void vp10_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { __m128i in[8]; const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 << 4); // load input data - in[0] = _mm_load_si128((const __m128i *)input); - in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 1); + in[2] = load_input_data(input + 8 * 2); + in[3] = load_input_data(input + 8 * 3); + in[4] = load_input_data(input + 8 * 4); + in[5] = load_input_data(input + 8 * 5); + in[6] = load_input_data(input + 8 * 6); + in[7] = load_input_data(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -144,8 +144,8 @@ RECON_AND_STORE(dest + 7 * stride, in[7]); } -void vp10_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, - int tx_type) { +void vp10_iht16x16_256_add_sse2(const 
tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { __m128i in0[16], in1[16]; load_buffer_8x16(input, in0);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 968dad2..04fa8f6 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -395,6 +395,7 @@ unsigned char *const seg_map = cpi->segmentation_map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; + int consec_zero_mv_thresh = 0; memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols); sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; @@ -407,6 +408,9 @@ assert(cr->sb_index < sbs_in_frame); i = cr->sb_index; cr->target_num_seg_blocks = 0; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cr->percent_refresh > 0) + consec_zero_mv_thresh = 10 * (100 / cr->percent_refresh); do { int sum_map = 0; // Get the mi_row/mi_col corresponding to superblock index i. @@ -416,9 +420,6 @@ int mi_col = sb_col_index * MI_BLOCK_SIZE; int qindex_thresh = vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex); - int consec_zero_mv_thresh = - cpi->oxcf.content == VP9E_CONTENT_SCREEN ? 0 - : 10 * (100 / cr->percent_refresh); assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); bl_index = mi_row * cm->mi_cols + mi_col;