Merge "SSE2 based h_predictor_32x32"
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d..ae5ba18 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -29,11 +29,14 @@
LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
- armv7-darwin-gcc
- armv7s-darwin-gcc
- x86-iphonesimulator-gcc
- x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+ armv7-darwin-gcc
+ armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+ x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+ x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
# Configures for the target specified by $1, and invokes make with the dist
# target using $DIST_DIR as the distribution output directory.
@@ -197,15 +200,27 @@
fi
}
+print_list() {
+ local indent="$1"
+ shift
+ local list="$@"
+ for entry in ${list}; do
+ echo "${indent}${entry}"
+ done
+}
+
iosbuild_usage() {
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
--extra-configure-args <args>: Extra args to pass when configuring libvpx.
+ --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+ and x86_64. Allows linking to framework when builds target MacOSX
+ instead of iOS.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
--targets <targets>: Override default target list. Defaults:
- ${TARGETS}
+$(print_list " " ${TARGETS})
--test-link: Confirms all targets can be linked. Functionally identical to
passing --enable-examples via --extra-configure-args.
--verbose: Output information about the environment and each stage of the
@@ -249,6 +264,9 @@
TARGETS="$2"
shift
;;
+ --macosx)
+ TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+ ;;
--verbose)
VERBOSE=yes
;;
@@ -273,10 +291,12 @@
MAKEFLAGS=${MAKEFLAGS}
ORIG_PWD=${ORIG_PWD}
PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
- TARGETS="${TARGETS}"
+ TARGETS="$(print_list "" ${TARGETS})"
+ OSX_TARGETS="${OSX_TARGETS}"
+ SIM_TARGETS="${SIM_TARGETS}"
EOF
fi
build_framework "${TARGETS}"
echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo " ${TARGETS}"
+print_list "" ${TARGETS}
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..e7d3fa5 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,7 +119,7 @@
%if ABI_IS_32BIT
%if CONFIG_PIC=1
%ifidn __OUTPUT_FORMAT__,elf32
- %define GET_GOT_SAVE_ARG 1
+ %define GET_GOT_DEFINED 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
- %define GET_GOT_SAVE_ARG 1
+ %define GET_GOT_DEFINED 1
%macro GET_GOT 1
push %1
call %%get_got
@@ -149,6 +149,8 @@
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
+ %else
+ %define GET_GOT_DEFINED 0
%endif
%endif
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index b852a65..e9e3949 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -262,7 +262,7 @@
}
#if CONFIG_MISC_FIXES
-static inline void memset16(uint16_t *dst, int val, int n) {
+static INLINE void memset16(uint16_t *dst, int val, int n) {
while (n--)
*dst++ = val;
}
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 80f1778..70d012b 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -1152,16 +1152,12 @@
cm->uv_dc_delta_q = read_delta_q(rb);
cm->uv_ac_delta_q = read_delta_q(rb);
cm->dequant_bit_depth = cm->bit_depth;
- for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
- const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
- xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
- qindex == 0 &&
-#else
- cm->base_qindex == 0 &&
-#endif
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+ vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+ cm->base_qindex;
+ xd->lossless[i] = qindex == 0 &&
+ cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
}
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 32275d4..5f6d9d3 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1473,7 +1473,7 @@
assert(n_log2_tiles > 0);
vpx_wb_write_literal(&saved_wb, mag, 2);
if (mag < 3)
- data_sz = remux_tiles(data, data_sz, 1 << n_log2_tiles, mag);
+ data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
} else {
assert(n_log2_tiles == 0);
}
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index bcdcff3..44ca276 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1155,7 +1155,7 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
x->source_variance =
vp10_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
- bsize, xd->bd);
+ bsize, xd->bd);
} else {
x->source_variance =
vp10_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
@@ -2579,7 +2579,7 @@
}
static TX_MODE select_tx_mode(const VP10_COMP *cpi, MACROBLOCKD *const xd) {
- if (!cpi->common.seg.enabled && xd->lossless[0])
+ if (xd->lossless[0])
return ONLY_4X4;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
return ALLOW_32X32;
@@ -2702,16 +2702,12 @@
rdc->m_search_count = 0; // Count of motion search hits.
rdc->ex_search_count = 0; // Exhaustive mesh search hits.
- for (i = 0; i < (cm->seg.enabled ? MAX_SEGMENTS : 1); ++i) {
-#if CONFIG_MISC_FIXES
- const int qindex = vp10_get_qindex(&cm->seg, i, cm->base_qindex);
-#endif
- xd->lossless[i] = cm->y_dc_delta_q == 0 &&
-#if CONFIG_MISC_FIXES
- qindex == 0 &&
-#else
- cm->base_qindex == 0 &&
-#endif
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
+ vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
+ cm->base_qindex;
+ xd->lossless[i] = qindex == 0 &&
+ cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
}
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c85..015dbc0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -135,15 +135,38 @@
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
- int src_byte = frame1[byte];
- int pixel_value = *frame2++;
+ int pixel_value = *frame2;
- modifier = src_byte - pixel_value;
- // This is an integer approximation of:
- // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
- // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
- modifier *= modifier;
- modifier *= 3;
+ // non-local mean approach
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+
+ if (row >= 0 && row < (int)block_height &&
+ col >= 0 && col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx)
+ modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
modifier += rounding;
modifier >>= strength;
@@ -182,15 +205,34 @@
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
- int src_byte = frame1[byte];
- int pixel_value = *frame2++;
+ int pixel_value = *frame2;
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
- modifier = src_byte - pixel_value;
- // This is an integer approximation of:
- // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
- // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
- modifier *= modifier;
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+
+ if (row >= 0 && row < (int)block_height &&
+ col >= 0 && col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx)
+ modifier += diff_sse[idx];
+
modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
modifier += rounding;
modifier >>= strength;
@@ -383,55 +425,58 @@
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
- vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
- f->y_stride,
- predictor, 16, 16, adj_strength,
- filter_weight,
- accumulator, count);
- vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
- f->uv_stride, predictor + 256,
- mb_uv_width, mb_uv_height,
- adj_strength,
- filter_weight, accumulator + 256,
- count + 256);
- vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
- f->uv_stride, predictor + 512,
- mb_uv_width, mb_uv_height,
- adj_strength, filter_weight,
- accumulator + 512, count + 512);
+ vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor, 16, 16, adj_strength,
+ filter_weight,
+ accumulator, count);
+ vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height,
+ adj_strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height,
+ adj_strength, filter_weight,
+ accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
- vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16,
+ strength, filter_weight,
+ accumulator, count);
+ vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 512,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 512,
+ count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ // TODO(jingning): Need SIMD optimization for this.
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16,
strength, filter_weight,
accumulator, count);
- vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256,
mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 256,
count + 256);
- vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512,
mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 512,
count + 512);
- }
-#else
- // Apply the filter (YUV)
- vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, 16,
- strength, filter_weight,
- accumulator, count);
- vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256,
- mb_uv_width, mb_uv_height, strength,
- filter_weight, accumulator + 256,
- count + 256);
- vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512,
- mb_uv_width, mb_uv_height, strength,
- filter_weight, accumulator + 512,
- count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 5f9c963..22d52a2 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -123,7 +123,10 @@
%define sec_str sec_stridemp
; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -131,7 +134,6 @@
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
- RESTORE_GOT ; restore esp
LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
@@ -140,7 +142,10 @@
%define block_height heightd
; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -148,7 +153,6 @@
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
- RESTORE_GOT ; restore esp
LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 4aca3ec..f27b8d9 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -47,9 +47,9 @@
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
GET_GOT goffsetq
- movifnidn leftq, leftmp
pxor m1, m1
movd m0, [leftq]
psadbw m0, m1
@@ -143,9 +143,9 @@
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
GET_GOT goffsetq
- movifnidn leftq, leftmp
pxor m1, m1
movq m0, [leftq]
DEFINE_ARGS dst, stride, stride3
@@ -239,14 +239,11 @@
GET_GOT goffsetq
pxor m1, m1
- pxor m2, m2
mova m0, [aboveq]
DEFINE_ARGS dst, stride, stride3, lines4
lea stride3q, [strideq*3]
mov lines4d, 4
psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
movhlps m2, m0
paddw m0, m2
paddw m0, [GLOBAL(pw2_16)]
@@ -271,14 +268,11 @@
GET_GOT goffsetq
pxor m1, m1
- pxor m2, m2
mova m0, [leftq]
DEFINE_ARGS dst, stride, stride3, lines4
lea stride3q, [strideq*3]
mov lines4d, 4
psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
movhlps m2, m0
paddw m0, m2
paddw m0, [GLOBAL(pw2_16)]
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 1176a2f..c655e4b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,7 +139,10 @@
%define sec_str sec_stridemp
;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -147,7 +150,6 @@
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
- RESTORE_GOT ; restore esp
LOAD_IF_USED 0, 1 ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
@@ -156,7 +158,10 @@
%define block_height heightd
;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -164,7 +169,6 @@
lea ecx, [GLOBAL(pw_8)]
mov g_pw_8m, ecx
- RESTORE_GOT ; restore esp
LOAD_IF_USED 0, 1 ; load eax, ecx back
%endif
%else
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
%if ABI_IS_32BIT
%if CONFIG_PIC=1
%ifidn __OUTPUT_FORMAT__,elf32
- %define GET_GOT_SAVE_ARG 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
- %define GET_GOT_SAVE_ARG 1
%macro GET_GOT 1
push %1
call %%get_got