Merge "Deadline is not supported in VP9 decoder, removing it completely."
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 826ff2f..816334e 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -38,8 +38,9 @@
# For this we import the 'cpufeatures' module from the NDK sources.
# libvpx can also be configured without this runtime detection method.
# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
-# Configuring with --disable-runtime-cpu-detect --disable-neon will remove any
-# NEON dependency.
+# Configuring with --disable-runtime-cpu-detect --disable-neon \
+# --disable-neon-asm
+# will remove any NEON dependency.
# To change to building armeabi, run ./libvpx/configure again, but with
# --target=arm5te-android-gcc and modify the Application.mk file to
diff --git a/build/make/Makefile b/build/make/Makefile
index c4d53f1..63ec271 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -19,6 +19,7 @@
done
all: .DEFAULT
clean:: .DEFAULT
+exampletest: .DEFAULT
install:: .DEFAULT
test:: .DEFAULT
testdata:: .DEFAULT
@@ -105,6 +106,8 @@
.PHONY: dist
dist:
+.PHONY: exampletest
+exampletest:
.PHONY: install
install::
.PHONY: test
diff --git a/build/make/configure.sh b/build/make/configure.sh
index a2a64c2..ad7dc82 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -518,7 +518,7 @@
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
- [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
+ [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
elif [ $action = "disable" ] && ! disabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
@@ -835,7 +835,7 @@
check_add_cflags -march=armv7-a -mfloat-abi=${float_abi}
check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
- if enabled neon
+ if enabled neon || enabled neon_asm
then
check_add_cflags -mfpu=neon #-ftree-vectorize
check_add_asflags -mfpu=neon
@@ -882,7 +882,7 @@
tune_asflags="--cpu="
if [ -z "${tune_cpu}" ]; then
if [ ${tgt_isa} = "armv7" ]; then
- if enabled neon
+ if enabled neon || enabled neon_asm
then
check_add_cflags --fpu=softvfp+vfpv3
check_add_asflags --fpu=softvfp+vfpv3
@@ -1284,8 +1284,8 @@
local makefile=$2
shift 2
for cfg; do
- upname="`toupper $cfg`"
if enabled $cfg; then
+ upname="`toupper $cfg`"
echo "${prefix}_${upname}=yes" >> $makefile
fi
done
diff --git a/configure b/configure
index bd95056..b6d645a 100755
--- a/configure
+++ b/configure
@@ -52,6 +52,7 @@
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
${toggle_webm_io} enable input from and output to WebM container
+ ${toggle_libyuv} enable libyuv
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@@ -265,8 +266,9 @@
unistd_h
"
EXPERIMENT_LIST="
- multiple_arf
alpha
+ multiple_arf
+ spatial_svc
"
CONFIG_LIST="
external_build
@@ -314,6 +316,7 @@
os_support
unit_tests
webm_io
+ libyuv
decode_perf_tests
multi_res_encoding
temporal_denoising
@@ -367,6 +370,7 @@
postproc_visualizer
unit_tests
webm_io
+ libyuv
decode_perf_tests
multi_res_encoding
temporal_denoising
@@ -708,9 +712,11 @@
*-vs*)
soft_enable unit_tests
soft_enable webm_io
+ soft_enable libyuv
;;
*-android-*)
soft_enable webm_io
+ soft_enable libyuv
# GTestLog must be modified to use Android logging utilities.
;;
*-darwin-*)
@@ -727,6 +733,9 @@
check_cxx "$@" <<EOF && soft_enable webm_io
int z;
EOF
+ check_cxx "$@" <<EOF && soft_enable libyuv
+int z;
+EOF
;;
*)
enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
@@ -735,6 +744,9 @@
check_cxx "$@" <<EOF && soft_enable webm_io
int z;
EOF
+ check_cxx "$@" <<EOF && soft_enable libyuv
+int z;
+EOF
;;
esac
# libwebm needs to be linked with C++ standard library
diff --git a/examples.mk b/examples.mk
index c36159f..946c030 100644
--- a/examples.mk
+++ b/examples.mk
@@ -10,10 +10,24 @@
LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/include/libyuv/cpu_id.h \
+ third_party/libyuv/include/libyuv/planar_functions.h \
+ third_party/libyuv/include/libyuv/row.h \
third_party/libyuv/include/libyuv/scale.h \
- third_party/libyuv/source/row.h \
- third_party/libyuv/source/scale.c \
- third_party/libyuv/source/cpu_id.c
+ third_party/libyuv/include/libyuv/scale_row.h \
+ third_party/libyuv/source/cpu_id.cc \
+ third_party/libyuv/source/planar_functions.cc \
+ third_party/libyuv/source/row_any.cc \
+ third_party/libyuv/source/row_common.cc \
+ third_party/libyuv/source/row_mips.cc \
+ third_party/libyuv/source/row_neon.cc \
+ third_party/libyuv/source/row_posix.cc \
+ third_party/libyuv/source/row_win.cc \
+ third_party/libyuv/source/scale.cc \
+ third_party/libyuv/source/scale_common.cc \
+ third_party/libyuv/source/scale_mips.cc \
+ third_party/libyuv/source/scale_neon.cc \
+ third_party/libyuv/source/scale_posix.cc \
+ third_party/libyuv/source/scale_win.cc
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
@@ -42,7 +56,9 @@
vpxdec.SRCS += ivfdec.c ivfdec.h
vpxdec.SRCS += tools_common.c tools_common.h
vpxdec.SRCS += y4menc.c y4menc.h
-vpxdec.SRCS += $(LIBYUV_SRCS)
+ifeq ($(CONFIG_LIBYUV),yes)
+ vpxdec.SRCS += $(LIBYUV_SRCS)
+endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
@@ -60,34 +76,38 @@
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
vpxenc.SRCS += vpxstats.c vpxstats.h
-vpxenc.SRCS += $(LIBYUV_SRCS)
+ifeq ($(CONFIG_LIBYUV),yes)
+ vpxenc.SRCS += $(LIBYUV_SRCS)
+endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
endif
vpxenc.GUID = 548DEC74-7A15-4B2B-AFC3-AA102E7C25C1
vpxenc.DESCRIPTION = Full featured encoder
-EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_spatial_scalable_encoder.c
-vp9_spatial_scalable_encoder.SRCS += args.c args.h
-vp9_spatial_scalable_encoder.SRCS += ivfenc.c ivfenc.h
-vp9_spatial_scalable_encoder.SRCS += tools_common.c tools_common.h
-vp9_spatial_scalable_encoder.SRCS += video_common.h
-vp9_spatial_scalable_encoder.SRCS += video_writer.h video_writer.c
-vp9_spatial_scalable_encoder.SRCS += vpxstats.c vpxstats.h
-vp9_spatial_scalable_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
-vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
+ifeq ($(CONFIG_SPATIAL_SVC),yes)
+ EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_spatial_svc_encoder.c
+ vp9_spatial_svc_encoder.SRCS += args.c args.h
+ vp9_spatial_svc_encoder.SRCS += ivfenc.c ivfenc.h
+ vp9_spatial_svc_encoder.SRCS += tools_common.c tools_common.h
+ vp9_spatial_svc_encoder.SRCS += video_common.h
+ vp9_spatial_svc_encoder.SRCS += video_writer.h video_writer.c
+ vp9_spatial_svc_encoder.SRCS += vpxstats.c vpxstats.h
+ vp9_spatial_svc_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
+ vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
+endif
ifneq ($(CONFIG_SHARED),yes)
EXAMPLES-$(CONFIG_VP9_ENCODER) += resize_util.c
endif
-EXAMPLES-$(CONFIG_ENCODERS) += vpx_temporal_scalable_patterns.c
-vpx_temporal_scalable_patterns.SRCS += ivfenc.c ivfenc.h
-vpx_temporal_scalable_patterns.SRCS += tools_common.c tools_common.h
-vpx_temporal_scalable_patterns.SRCS += video_common.h
-vpx_temporal_scalable_patterns.SRCS += video_writer.h video_writer.c
-vpx_temporal_scalable_patterns.GUID = B18C08F2-A439-4502-A78E-849BE3D60947
-vpx_temporal_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
+EXAMPLES-$(CONFIG_ENCODERS) += vpx_temporal_svc_encoder.c
+vpx_temporal_svc_encoder.SRCS += ivfenc.c ivfenc.h
+vpx_temporal_svc_encoder.SRCS += tools_common.c tools_common.h
+vpx_temporal_svc_encoder.SRCS += video_common.h
+vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947
+vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
EXAMPLES-$(CONFIG_VP8_DECODER) += simple_decoder.c
simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
simple_decoder.SRCS += ivfdec.h ivfdec.c
@@ -158,11 +178,13 @@
ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
+ifeq ($(CONFIG_LIBYUV),yes)
EXAMPLES-$(CONFIG_VP8_DECODER) += vp8_multi_resolution_encoder.c
vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS)
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
endif
+endif
# Handle extra library flags depending on codec configuration
diff --git a/examples/set_maps.c b/examples/set_maps.c
index 4343832..4ba38ee 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -64,7 +64,8 @@
static void set_roi_map(const vpx_codec_enc_cfg_t *cfg,
vpx_codec_ctx_t *codec) {
unsigned int i;
- vpx_roi_map_t roi = {0};
+ vpx_roi_map_t roi;
+ memset(&roi, 0, sizeof(roi));
roi.rows = (cfg->g_h + 15) / 16;
roi.cols = (cfg->g_w + 15) / 16;
@@ -97,7 +98,7 @@
static void set_active_map(const vpx_codec_enc_cfg_t *cfg,
vpx_codec_ctx_t *codec) {
unsigned int i;
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = {0, 0, 0};
map.rows = (cfg->g_h + 15) / 16;
map.cols = (cfg->g_w + 15) / 16;
@@ -114,7 +115,7 @@
static void unset_active_map(const vpx_codec_enc_cfg_t *cfg,
vpx_codec_ctx_t *codec) {
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = {0, 0, 0};
map.rows = (cfg->g_h + 15) / 16;
map.cols = (cfg->g_w + 15) / 16;
@@ -153,22 +154,23 @@
int main(int argc, char **argv) {
FILE *infile = NULL;
- vpx_codec_ctx_t codec = {0};
- vpx_codec_enc_cfg_t cfg = {0};
+ vpx_codec_ctx_t codec;
+ vpx_codec_enc_cfg_t cfg;
int frame_count = 0;
- vpx_image_t raw = {0};
+ vpx_image_t raw;
vpx_codec_err_t res;
- VpxVideoInfo info = {0};
+ VpxVideoInfo info;
VpxVideoWriter *writer = NULL;
const VpxInterface *encoder = NULL;
const int fps = 2; // TODO(dkovalev) add command line argument
const double bits_per_pixel_per_frame = 0.067;
exec_name = argv[0];
-
if (argc != 6)
die("Invalid number of arguments");
+ memset(&info, 0, sizeof(info));
+
encoder = get_vpx_encoder_by_name(argv[1]);
if (!encoder)
die("Unsupported codec.");
diff --git a/examples/vp9_spatial_scalable_encoder.c b/examples/vp9_spatial_svc_encoder.c
similarity index 100%
rename from examples/vp9_spatial_scalable_encoder.c
rename to examples/vp9_spatial_svc_encoder.c
diff --git a/examples/vpx_temporal_scalable_patterns.c b/examples/vpx_temporal_svc_encoder.c
similarity index 99%
rename from examples/vpx_temporal_scalable_patterns.c
rename to examples/vpx_temporal_svc_encoder.c
index 07dd318..e45b50c 100644
--- a/examples/vpx_temporal_scalable_patterns.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-// This is an example demonstrating how to implement a multi-layer VP9
+// This is an example demonstrating how to implement a multi-layer VPx
// encoding scheme based on temporal scalability for video applications
// that benefit from a scalable bitstream.
diff --git a/libs.mk b/libs.mk
index 85c5f8a..d02e9bc 100644
--- a/libs.mk
+++ b/libs.mk
@@ -115,7 +115,7 @@
CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
- INSTALL-LIBS-yes += include/vpx/svc_context.h
+ INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
CODEC_DOC_SECTIONS += vp9 vp9_encoder
@@ -556,3 +556,26 @@
utiltest:
@echo Unit tests must be enabled to make the utiltest target.
endif
+
+##
+## Example tests.
+##
+ifeq ($(CONFIG_UNIT_TESTS),yes)
+# All non-MSVC targets output example targets in a sub dir named examples.
+EXAMPLES_BIN_PATH = examples
+ifeq ($(CONFIG_MSVS),yes)
+# MSVC will build both Debug and Release configurations of the examples in a
+# sub directory named for the current target. Assume the user wants to
+# run the Release tools, and assign EXAMPLES_BIN_PATH accordingly.
+# TODO(tomfinegan): Is this adequate for ARM?
+# TODO(tomfinegan): Support running the debug versions of tools?
+EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
+endif
+exampletest: examples testdata
+ $(qexec)$(SRC_PATH_BARE)/test/examples.sh \
+ --test-data-path $(LIBVPX_TEST_DATA_PATH) \
+ --bin-path $(EXAMPLES_BIN_PATH)
+else
+exampletest:
+ @echo Unit tests must be enabled to make the exampletest target.
+endif
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index cbb4036..a6dcc98 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -341,6 +341,9 @@
for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
const InterpKernel *filters =
vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+ const InterpKernel *const eighttap_smooth =
+ vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+
for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
filter_block2d_8_c(in, kInputStride,
@@ -348,7 +351,7 @@
ref, kOutputStride,
Width(), Height());
- if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))
+ if (filters == eighttap_smooth || (filter_x && filter_y))
REGISTER_STATE_CHECK(
UUT_->hv8_(in, kInputStride, out, kOutputStride,
filters[filter_x], 16, filters[filter_y], 16,
@@ -396,6 +399,8 @@
for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
const InterpKernel *filters =
vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+ const InterpKernel *const eighttap_smooth =
+ vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -404,7 +409,7 @@
ref, kOutputStride,
Width(), Height());
- if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))
+ if (filters == eighttap_smooth || (filter_x && filter_y))
REGISTER_STATE_CHECK(
UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,
filters[filter_x], 16, filters[filter_y], 16,
@@ -544,6 +549,7 @@
TEST_P(ConvolveTest, CheckScalingFiltering) {
uint8_t* const in = input();
uint8_t* const out = output();
+ const InterpKernel *const eighttap = vp9_get_interp_kernel(EIGHTTAP);
SetConstantInput(127);
@@ -551,8 +557,8 @@
for (int step = 1; step <= 32; ++step) {
/* Test the horizontal and vertical filters in combination. */
REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
- vp9_sub_pel_filters_8[frac], step,
- vp9_sub_pel_filters_8[frac], step,
+ eighttap[frac], step,
+ eighttap[frac], step,
Width(), Height()));
CheckGuardBlocks();
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 143a267..7900bcf 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -512,7 +512,9 @@
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
-#if HAVE_NEON_ASM
+// FIXME(jingning, fgalligan): Simplify the corresponding steps in the NEON
+// version accordingly, and re-enable this unit test.
+#if HAVE_NEON_ASM && 0
INSTANTIATE_TEST_CASE_P(
NEON, Trans16x16DCT,
::testing::Values(
diff --git a/test/examples.sh b/test/examples.sh
index ac2a18c..7ba9cce 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -24,5 +24,6 @@
for test in ${example_tests}; do
# Source each test script so that exporting variables can be avoided.
+ VPX_TEST_NAME="$(basename ${test%.*})"
. "${test}"
done
diff --git a/test/resize_util.sh b/test/resize_util.sh
new file mode 100755
index 0000000..2a8e3fb
--- /dev/null
+++ b/test/resize_util.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx resize_util example code. To add new tests to
+## this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to resize_util_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+resize_util_verify_environment() {
+  # Nothing to report when the raw YUV input file is present.
+  [ -e "${YUV_RAW_INPUT}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Resizes $YUV_RAW_INPUT using the resize_util example. $1 is the output
+# dimensions that will be passed to resize_util.
+resize_util() {
+  local tool="${LIBVPX_BIN_PATH}/resize_util${VPX_TEST_EXE_SUFFIX}"
+  local resized="${VPX_TEST_OUTPUT_DIR}/resize_util.raw"
+  local source_size="${YUV_RAW_INPUT_WIDTH}x${YUV_RAW_INPUT_HEIGHT}"
+  local frame_count="10"
+  local target_size="$1"
+
+  # resize_util is only available without CONFIG_SHARED; pass trivially if set.
+  [ -z "$(vpx_config_option_enabled CONFIG_SHARED)" ] || return 0
+  # Fail when the example binary is missing or not executable.
+  [ -x "${tool}" ] || return 1
+
+  eval "${tool}" "${YUV_RAW_INPUT}" "${source_size}" "${target_size}" \
+    "${resized}" ${frame_count} ${devnull}
+
+  # Success is judged by the output file existing; eval's status is not used.
+  [ -e "${resized}" ] || return 1
+}
+
+# Halves each dimension of $YUV_RAW_INPUT using resize_util().
+resize_down() {
+  local half_w=$((${YUV_RAW_INPUT_WIDTH} / 2))
+  local half_h=$((${YUV_RAW_INPUT_HEIGHT} / 2))
+  # Integer division: odd source dimensions round down.
+  resize_util "${half_w}x${half_h}"
+}
+
+# Doubles each dimension of $YUV_RAW_INPUT using resize_util().
+resize_up() {
+  local double_w=$((${YUV_RAW_INPUT_WIDTH} * 2))
+  local double_h=$((${YUV_RAW_INPUT_HEIGHT} * 2))
+  # Hand the doubled geometry to the shared resize_util() driver.
+  resize_util "${double_w}x${double_h}"
+}
+
+resize_util_tests="resize_down
+ resize_up"
+
+run_tests resize_util_verify_environment "${resize_util_tests}"
diff --git a/test/test.mk b/test/test.mk
index 44d2f9c..f0a27c7 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -113,10 +113,13 @@
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
+endif
+
endif # VP9
LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 9c10d48..472111c 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -302,8 +302,13 @@
# functions and are run unconditionally. Functions in positional parameter two
# are run according to the rules specified in vpx_test_usage().
run_tests() {
- env_tests="verify_vpx_test_environment ${1}"
- tests_to_filter="${2}"
+ local env_tests="verify_vpx_test_environment $1"
+ local tests_to_filter="$2"
+ local test_name="${VPX_TEST_NAME}"
+
+  # No VPX_TEST_NAME: fall back to this script's name without its extension.
+  # Quotes inside $(...) nest on their own; escaped ones become literal text.
+  [ -n "${test_name}" ] || test_name="$(basename "${0%.*}")"
if [ "${VPX_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then
# Filter out DISABLED tests.
@@ -315,7 +320,7 @@
tests_to_filter=$(filter_strings "${tests_to_filter}" ${VPX_TEST_FILTER})
fi
- tests_to_run="${env_tests} ${tests_to_filter}"
+ local tests_to_run="${env_tests} ${tests_to_filter}"
check_git_hashes
@@ -328,8 +333,8 @@
test_end "${test}"
done
- tested_config="$(test_configuration_target) @ $(current_hash)"
- echo $(basename "${0%.*}"): Done, all tests pass for ${tested_config}.
+ local tested_config="$(test_configuration_target) @ $(current_hash)"
+ echo "${test_name}: Done, all tests pass for ${tested_config}."
}
vpx_test_usage() {
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
new file mode 100755
index 0000000..fe3cbbb
--- /dev/null
+++ b/test/twopass_encoder.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx twopass_encoder example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to twopass_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+twopass_encoder_verify_environment() {
+  # Nothing to report when the raw YUV input file is present.
+  [ -e "${YUV_RAW_INPUT}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs twopass_encoder using the codec specified by $1.
+twopass_encoder() {
+  local codec_name="$1"
+  local tool="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
+  local ivf_out="${VPX_TEST_OUTPUT_DIR}/twopass_encoder_${codec_name}.ivf"
+
+  # Fail when the example binary is missing or not executable.
+  [ -x "${tool}" ] || return 1
+  eval "${tool}" "${codec_name}" \
+    "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+    "${YUV_RAW_INPUT}" "${ivf_out}" ${devnull}
+  # Pass only if an output file exists afterwards.
+  [ -e "${ivf_out}" ] || return 1
+}
+
+twopass_encoder_vp8() {
+  # Succeed without running when vp8_encode_available does not report "yes".
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  twopass_encoder vp8 || return 1
+}
+
+# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
+# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
+# machine.
+DISABLED_twopass_encoder_vp9() {
+  # Succeed without running when vp9_encode_available does not report "yes".
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  twopass_encoder vp9 || return 1
+}
+
+twopass_encoder_tests="twopass_encoder_vp8
+ DISABLED_twopass_encoder_vp9"
+
+run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh
new file mode 100755
index 0000000..635cfa2
--- /dev/null
+++ b/test/vp9_spatial_svc_encoder.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx vp9_spatial_svc_encoder example. To add new
+## tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to vp9_spatial_svc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp9_spatial_svc_encoder_verify_environment() {
+  # Nothing to report when the raw YUV input file is present.
+  [ -e "${YUV_RAW_INPUT}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs vp9_spatial_svc_encoder. $1 is the test name.
+vp9_spatial_svc_encoder() {
+  local tool="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}"
+  local label="$1"
+  local ivf_out="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${label}.ivf"
+  local frame_limit="10"
+  local kf_max="9999"
+
+  # Drop the test name; "$@" now holds only the extra encoder options.
+  shift
+  # Fail when the example binary is missing or not executable.
+  [ -x "${tool}" ] || return 1
+
+  eval "${tool}" \
+    -w "${YUV_RAW_INPUT_WIDTH}" -h "${YUV_RAW_INPUT_HEIGHT}" \
+    -k "${kf_max}" -f "${frame_limit}" \
+    "$@" "${YUV_RAW_INPUT}" "${ivf_out}" ${devnull}
+
+  [ -e "${ivf_out}" ] || return 1
+}
+
+# Each mode is run with layer count 1-$vp9_ssvc_test_layers.
+vp9_ssvc_test_layers=5
+
+vp9_spatial_svc_mode_i() {
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  # ${FUNCNAME} is bash-only (empty under a plain /bin/sh): spell the name out.
+  local test_name="vp9_spatial_svc_mode_i"
+  for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
+    vp9_spatial_svc_encoder "${test_name}" -m i -l ${layers} || return 1
+  done
+}
+
+vp9_spatial_svc_mode_altip() {
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  # ${FUNCNAME} is bash-only (empty under a plain /bin/sh): spell the name out.
+  local test_name="vp9_spatial_svc_mode_altip"
+  for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
+    vp9_spatial_svc_encoder "${test_name}" -m "alt-ip" -l ${layers} || return 1
+  done
+}
+
+vp9_spatial_svc_mode_ip() {
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  # ${FUNCNAME} is bash-only (empty under a plain /bin/sh): spell the name out.
+  local test_name="vp9_spatial_svc_mode_ip"
+  vp9_spatial_svc_encoder "${test_name}" -m ip -l 1
+}
+
+vp9_spatial_svc_mode_gf() {
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  # ${FUNCNAME} is bash-only (empty under a plain /bin/sh): spell the name out.
+  local test_name="vp9_spatial_svc_mode_gf"
+  for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
+    vp9_spatial_svc_encoder "${test_name}" -m gf -l ${layers} || return 1
+  done
+}
+
+vp9_spatial_svc_tests="vp9_spatial_svc_mode_i
+ vp9_spatial_svc_mode_altip
+ vp9_spatial_svc_mode_ip
+ vp9_spatial_svc_mode_gf"
+
+run_tests vp9_spatial_svc_encoder_verify_environment "${vp9_spatial_svc_tests}"
diff --git a/test/vpx_temporal_svc_encoder.sh b/test/vpx_temporal_svc_encoder.sh
new file mode 100755
index 0000000..ff64740
--- /dev/null
+++ b/test/vpx_temporal_svc_encoder.sh
@@ -0,0 +1,283 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx vpx_temporal_svc_encoder example. To add new
+## tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to vpx_tsvc_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vpx_tsvc_encoder_verify_environment() {
+  # Nothing to report when the raw YUV input file is present.
+  [ -e "${YUV_RAW_INPUT}" ] && return 0
+  echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+  return 1
+}
+
+# Runs vpx_temporal_svc_encoder using the codec specified by $1 and output file
+# name by $2. Additional positional parameters are passed directly to
+# vpx_temporal_svc_encoder.
+vpx_tsvc_encoder() {
+  local tool="${LIBVPX_BIN_PATH}/vpx_temporal_svc_encoder"
+  tool="${tool}${VPX_TEST_EXE_SUFFIX}"
+  local codec_name="$1"
+  local out_prefix="${VPX_TEST_OUTPUT_DIR}/$2"
+  local tb_num="1"
+  local tb_den="1000"
+  local speed_arg="6"
+  local drop_thresh="30"
+
+  # Discard codec and output name; "$@" keeps the remaining encoder arguments.
+  shift 2
+
+  # Fail when the example binary is missing or not executable.
+  [ -x "${tool}" ] || return 1
+
+  # The function's status is whatever the eval'd encoder command returns.
+  eval "${tool}" "${YUV_RAW_INPUT}" "${out_prefix}" "${codec_name}" \
+    "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
+    "${tb_num}" "${tb_den}" "${speed_arg}" "${drop_thresh}" "$@" ${devnull}
+}
+
+# Confirms that all expected output files exist given the output file name
+# passed to vpx_temporal_svc_encoder.
+# The file name passed to vpx_temporal_svc_encoder is joined with the stream
+# number and the extension .ivf to produce per stream output files. Here $1 is
+# file name, and $2 is expected number of files.
+files_exist() {
+  local prefix="${VPX_TEST_OUTPUT_DIR}/$1"
+  # Streams are numbered from 0; stop at the first missing per-stream file.
+  for stream_num in $(seq 0 $(($2 - 1))); do
+    [ -e "${prefix}_${stream_num}.ivf" ] || return 1
+  done
+}
+
+# Run vpx_temporal_svc_encoder in all supported modes for vp8 and vp9.
+
+vpx_tsvc_encoder_vp8_mode_0() {
+  local name="vpx_tsvc_encoder_vp8_mode_0"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 0 200 || return 1
+  # Mode 0 produces 1 stream
+  files_exist "${name}" 1 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_1() {
+  local name="vpx_tsvc_encoder_vp8_mode_1"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 1 200 400 || return 1
+  # Mode 1 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_2() {
+  local name="vpx_tsvc_encoder_vp8_mode_2"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 2 200 400 || return 1
+  # Mode 2 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_3() {
+  local name="vpx_tsvc_encoder_vp8_mode_3"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 3 200 400 600 || return 1
+  # Mode 3 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_4() {
+  local name="vpx_tsvc_encoder_vp8_mode_4"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 4 200 400 600 || return 1
+  # Mode 4 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_5() {
+  local name="vpx_tsvc_encoder_vp8_mode_5"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 5 200 400 600 || return 1
+  # Mode 5 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_6() {
+  local name="vpx_tsvc_encoder_vp8_mode_6"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 6 200 400 600 || return 1
+  # Mode 6 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_7() {
+  local name="vpx_tsvc_encoder_vp8_mode_7"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 7 200 400 600 800 1000 || return 1
+  # Mode 7 produces 5 streams
+  files_exist "${name}" 5 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_8() {
+  local name="vpx_tsvc_encoder_vp8_mode_8"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 8 200 400 || return 1
+  # Mode 8 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_9() {
+  local name="vpx_tsvc_encoder_vp8_mode_9"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 9 200 400 600 || return 1
+  # Mode 9 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_10() {
+  local name="vpx_tsvc_encoder_vp8_mode_10"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 10 200 400 600 || return 1
+  # Mode 10 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp8_mode_11() {
+  local name="vpx_tsvc_encoder_vp8_mode_11"  # literal: FUNCNAME is bash-only
+  [ "$(vp8_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp8 "${name}" 11 200 400 600 || return 1
+  # Mode 11 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_0() {
+  local name="vpx_tsvc_encoder_vp9_mode_0"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 0 200 || return 1
+  # Mode 0 produces 1 stream
+  files_exist "${name}" 1 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_1() {
+  local name="vpx_tsvc_encoder_vp9_mode_1"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 1 200 400 || return 1
+  # Mode 1 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_2() {
+  local name="vpx_tsvc_encoder_vp9_mode_2"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 2 200 400 || return 1
+  # Mode 2 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_3() {
+  local name="vpx_tsvc_encoder_vp9_mode_3"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 3 200 400 600 || return 1
+  # Mode 3 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_4() {
+  local name="vpx_tsvc_encoder_vp9_mode_4"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 4 200 400 600 || return 1
+  # Mode 4 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_5() {
+  local name="vpx_tsvc_encoder_vp9_mode_5"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 5 200 400 600 || return 1
+  # Mode 5 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_6() {
+  local name="vpx_tsvc_encoder_vp9_mode_6"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 6 200 400 600 || return 1
+  # Mode 6 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_7() {
+  local name="vpx_tsvc_encoder_vp9_mode_7"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 7 200 400 600 800 1000 || return 1
+  # Mode 7 produces 5 streams
+  files_exist "${name}" 5 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_8() {
+  local name="vpx_tsvc_encoder_vp9_mode_8"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 8 200 400 || return 1
+  # Mode 8 produces 2 streams
+  files_exist "${name}" 2 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_9() {
+  local name="vpx_tsvc_encoder_vp9_mode_9"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 9 200 400 600 || return 1
+  # Mode 9 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_10() {
+  local name="vpx_tsvc_encoder_vp9_mode_10"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 10 200 400 600 || return 1
+  # Mode 10 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_vp9_mode_11() {
+  local name="vpx_tsvc_encoder_vp9_mode_11"  # literal: FUNCNAME is bash-only
+  [ "$(vp9_encode_available)" = "yes" ] || return 0
+  vpx_tsvc_encoder vp9 "${name}" 11 200 400 600 || return 1
+  # Mode 11 produces 3 streams
+  files_exist "${name}" 3 || return 1
+}
+
+vpx_tsvc_encoder_tests="vpx_tsvc_encoder_vp8_mode_0
+ vpx_tsvc_encoder_vp8_mode_1
+ vpx_tsvc_encoder_vp8_mode_2
+ vpx_tsvc_encoder_vp8_mode_3
+ vpx_tsvc_encoder_vp8_mode_4
+ vpx_tsvc_encoder_vp8_mode_5
+ vpx_tsvc_encoder_vp8_mode_6
+ vpx_tsvc_encoder_vp8_mode_7
+ vpx_tsvc_encoder_vp8_mode_8
+ vpx_tsvc_encoder_vp8_mode_9
+ vpx_tsvc_encoder_vp8_mode_10
+ vpx_tsvc_encoder_vp8_mode_11
+ vpx_tsvc_encoder_vp9_mode_0
+ vpx_tsvc_encoder_vp9_mode_1
+ vpx_tsvc_encoder_vp9_mode_2
+ vpx_tsvc_encoder_vp9_mode_3
+ vpx_tsvc_encoder_vp9_mode_4
+ vpx_tsvc_encoder_vp9_mode_5
+ vpx_tsvc_encoder_vp9_mode_6
+ vpx_tsvc_encoder_vp9_mode_7
+ vpx_tsvc_encoder_vp9_mode_8
+ vpx_tsvc_encoder_vp9_mode_9
+ vpx_tsvc_encoder_vp9_mode_10
+ vpx_tsvc_encoder_vp9_mode_11"
+
+run_tests vpx_tsvc_encoder_verify_environment "${vpx_tsvc_encoder_tests}"
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index d3495ca..577b42d 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 102
+Version: 1005
License: BSD
License File: LICENSE
@@ -13,5 +13,5 @@
in order to encode multiple resolution bit streams.
Local Modifications:
-Modified the original scaler code from C++ to C to fit in our current build
-system. This is a temporal solution, and will be improved later.
\ No newline at end of file
+Modified the original scaler code minimally with include file changes to fit
+in our current build system.
diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h
index 30504ce..beb750b 100644
--- a/third_party/libyuv/include/libyuv/basic_types.h
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@@ -1,22 +1,25 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
-#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h> // for uintptr_t on x86
+#else
#include <stdint.h> // for uintptr_t
#endif
+#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
@@ -30,9 +33,9 @@
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
-#ifdef __LP64__
-typedef unsigned long uint64;
-typedef long int64;
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
@@ -40,9 +43,9 @@
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
-#else // __LP64__
-typedef unsigned long long uint64;
-typedef long long int64;
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
@@ -54,20 +57,62 @@
#endif // COMPILER_MSVC
typedef unsigned int uint32;
typedef int int32;
-typedef unsigned short uint16;
-typedef short int16;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
typedef unsigned char uint8;
-typedef char int8;
+typedef signed char int8;
#endif // INT_TYPES_DEFINED
+#endif // GG_LONGLONG
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+#ifndef ALIGNP
+#ifdef __cplusplus
#define ALIGNP(p, t) \
- ((uint8*)((((uintptr_t)(p) + \
- ((t)-1)) & ~((t)-1))))
+ (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+ ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+ ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
+#endif
+#endif
-#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h
index 4a53b5b..fd6276b 100644
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -1,49 +1,81 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#define INCLUDE_LIBYUV_CPU_ID_H_
+#include "basic_types.h"
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// These flags are only valid on x86 processors
-static const int kCpuHasSSE2 = 1;
-static const int kCpuHasSSSE3 = 2;
+// TODO(fbarchard): Consider overlapping bits for different architectures.
+// Internal flag to indicate cpuid requires initialization.
+#define kCpuInit 0x1
-// These flags are only valid on ARM processors
-static const int kCpuHasNEON = 4;
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
-// Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 8;
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
// Detect CPU has SSE2 etc.
-// test_flag parameter should be one of kCpuHas constants above
+// Test_flag parameter should be one of kCpuHas constants above.
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
- extern int cpu_info_;
- extern int InitCpuFlags();
- return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+ LIBYUV_API extern int cpu_info_;
+ return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
-// -1 to enable all cpu specific optimizations.
-// 0 to disable all cpu specific optimizations.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
void MaskCpuFlags(int enable_flags);
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_CPU_ID_H_
+#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 0000000..43f8df3
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,439 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+// #include "convert.h"
+// #include "convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+ uint16* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value);
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Alias
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y, int width, int height,
+ int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_rgb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma_rgb_table,
+ int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Copy alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Copy Y channel to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+// 16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
+// and 255 means 1% src_argb0 and 99% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Row functions for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif // LIBYUV_DISABLE_X86
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h
new file mode 100644
index 0000000..daf5a45
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -0,0 +1,1704 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h> // For malloc.
+
+#include "basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#ifdef __cplusplus
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \
+ uint8* var = reinterpret_cast<uint8*> \
+ ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+#endif
+
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+// Enable for NaCL pepper 33 for bundle and AVX2 support.
+// #define NEW_BINUTILS
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTOBAYERGGROW_SSE2
+#define HAS_ARGBTOBAYERROW_SSSE3
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
+#define HAS_HALFROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+ defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+// Effects:
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#endif
+
+// The following require VS2012.
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_HALFROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#endif // defined(VISUALC_HAS_AVX2)
+
+// The following are Yasm x86 only (need HAVE_YASM and an x86/x64 target):
+// TODO(fbarchard): Port AVX2 to inline.
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) && \
+ (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__x86_64__) || defined(__i386__))
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MERGEUVROW_MMX
+#define HAS_SPLITUVROW_AVX2
+#define HAS_SPLITUVROW_MMX
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_UYVYTOYROW_MMX
+#define HAS_YUY2TOYROW_AVX2
+#define HAS_YUY2TOYROW_MMX
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YTOARGBROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+// TODO(fbarchard): Investigate neon unittest failure.
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)  // DSPR2-only rows below.
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#endif  // !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];  // 32-byte types: this branch only.
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+
+#elif defined(__GNUC__)
+// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+#else  // Other compilers: plain arrays; SIMD_ALIGNED adds no alignment.
+#define SIMD_ALIGNED(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+#endif  // SIMD_ALIGNED and vector typedefs
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+
+// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
+// NEW_BINUTILS and remove all BUNDLEALIGN occurrences.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN ".p2align 2\n"
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+#if defined(NEW_BINUTILS)
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define BUNDLEALIGN "\n"
+#else  // !defined(NEW_BINUTILS): lock/unlock are empty, align to 32 bytes.
+#define BUNDLELOCK "\n"
+#define BUNDLEUNLOCK "\n"
+#define BUNDLEALIGN ".p2align 5\n"
+#endif  // defined(NEW_BINUTILS)
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg "\n" \
+ BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+ BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%" #arg "\n" \
+ BUNDLEUNLOCK
+#else  // Not NaCl x86-64: direct operands, no %r15 base or %nacl: prefix.
+#define BUNDLEALIGN "\n"
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif  // defined(__native_client__) && defined(__x86_64__)
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+ uint8* dst_v, int pix);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+ uint8* dst_uv, int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+
+void SetRow_X86(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow_NEON(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow_C(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
+ int height);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,  // Same name as in the I422ToARGB1555Row_NEON prototype.
+ int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void YToARGBRow_C(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+
+void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+ uint16* dst_uv, int pix);
+
+void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width, int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h
index 35d0ff5..973d464 100644
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -1,30 +1,45 @@
/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
+ * in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef INCLUDE_LIBYUV_SCALE_H_
+#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_H_
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// Supported filtering
-typedef enum {
- kFilterNone = 0, // Point sample; Fastest
- kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
- kFilterBox = 2 // Highest quality
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
} FilterModeEnum;
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API  // Export like ScalePlane above; this variant takes uint16 planes.
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride, int dst_width, int dst_height,
+ enum FilterMode filtering);
+
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -35,6 +50,7 @@
// quality image, at further expense of speed.
// Returns 0 if successful.
+LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -43,28 +59,44 @@
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
- FilterModeEnum filtering);
+ enum FilterMode filtering);
-// Legacy API. Deprecated
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+ const uint16* src_u, int src_stride_u,
+ const uint16* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint16* dst_y, int dst_stride_y,
+ uint16* dst_u, int dst_stride_u,
+ uint16* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
- int interpolate);
+ LIBYUV_BOOL interpolate);
-// Legacy API. Deprecated
-int ScaleOffset(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_yoffset,
- int interpolate);
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+ uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+ LIBYUV_BOOL interpolate);
-// For testing, allow disabling of optimizations.
-void SetUseReferenceImpl(int use);
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
-#endif // INCLUDE_LIBYUV_SCALE_H_
+#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 0000000..5d91f8f
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,341 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN4_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEADDROWS_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_FIXEDDIV_X86
+#define HAS_FIXEDDIV1_X86
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_argb, uint16* dst_argb,
+ int x, int y, int dy,
+ int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint32* dst_ptr, int src_width, int src_height);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height);
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+// Row functions.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT
diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c
deleted file mode 100644
index fccf3dd..0000000
--- a/third_party/libyuv/source/cpu_id.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-#ifdef __ANDROID__
-#include <cpu-features.h>
-#endif
-
-#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
-
-// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile (
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile (
- "cpuid \n"
- : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type)
- );
-}
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// CPU detect function for SIMD instruction sets.
-int cpu_info_ = 0;
-
-int InitCpuFlags() {
-#ifdef CPU_X86
- int cpu_info[4];
- __cpuid(cpu_info, 1);
- cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
- (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
- kCpuInitialized;
-#elif defined(__ANDROID__) && defined(__ARM_NEON__)
- uint64_t features = android_getCpuFeatures();
- cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
- kCpuInitialized;
-#elif defined(__ARM_NEON__)
- // gcc -mfpu=neon defines __ARM_NEON__
- // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
- // to disable Neon on devices that do not have it.
- cpu_info_ = kCpuHasNEON | kCpuInitialized;
-#else
- cpu_info_ = kCpuInitialized;
-#endif
- return cpu_info_;
-}
-
-void MaskCpuFlags(int enable_flags) {
- InitCpuFlags();
- cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 0000000..520cfe5
--- /dev/null
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+
+#ifdef _MSC_VER
+#include <intrin.h> // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h> // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/basic_types.h" // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86 (cpu_info = eax,ebx,ecx,edx). Zeros on other CPUs.
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__))
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER)
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else  // No __cpuidex() and not 32 bit x86: __cpuid() takes no ecx input.
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+  }
+#endif
+#else  // defined(_MSC_VER)
+  uint32 info_ebx, info_edx;
+  asm volatile (  // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit: the ebx result is returned through edi.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // defined(_MSC_VER)
+}
+
+#if !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect if the OS saves high parts of ymm registers.
+int TestOsSaveYmm() {
+  uint32 xcr0 = 0u;
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+#elif defined(_M_IX86)
+  __asm {
+    xor        ecx, ecx    // xcr 0
+    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+    mov        xcr0, eax
+  }
+#elif defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");  // xgetbv
+#endif  // defined(_MSC_VER)
+  return((xcr0 & 6) == 6);  // Is ymm saved?  xcr0 stays 0 if no branch ran.
+}
+#endif  // !defined(__native_client__)
+#else
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+  memset(cpu_info, 0, 4 * sizeof(cpu_info[0]));  // No cpuid: report zeros.
+}
+#endif
+
+// Scans the cpuinfo-style text file |cpuinfo_name| for the " neon" feature,
+// based on libvpx arm_cpudetect.c.  Public to allow testing on any CPU.
+LIBYUV_API SAFEBUFFERS
+int ArmCpuCaps(const char* cpuinfo_name) {
+  char line[512];
+  int caps = 0;
+  FILE* cpuinfo = fopen(cpuinfo_name, "r");
+  if (cpuinfo == NULL) {
+    // Unreadable (e.g. Chrome sandbox for Pepper or Render): assume Neon.
+    return kCpuHasNEON;
+  }
+  while (fgets(line, sizeof(line) - 1, cpuinfo) != NULL) {
+    if (memcmp(line, "Features", 8) == 0) {
+      const char* hit = strstr(line, " neon");
+      if (hit != NULL && (hit[5] == ' ' || hit[5] == '\n')) {
+        caps = kCpuHasNEON;
+        break;
+      }
+    }
+  }
+  fclose(cpuinfo);
+  return caps;
+}
+
+#if defined(__mips__) && defined(__linux__)
+// Returns kCpuHasMIPS_DSP if any /proc/cpuinfo line contains search_string.
+static int MipsCpuCaps(const char* search_string) {
+  char line[512];
+  int caps = 0;
+  FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+  if (cpuinfo == NULL) {
+    // Unreadable (e.g. Chrome sandbox for Pepper or Render): assume DSP.
+    return kCpuHasMIPS_DSP;
+  }
+  while (fgets(line, sizeof(line) - 1, cpuinfo) != NULL) {
+    if (strstr(line, search_string) != NULL) {
+      caps = kCpuHasMIPS_DSP;
+      break;
+    }
+  }
+  fclose(cpuinfo);
+  return caps;
+}
+#endif
+
+// CPU detect function for SIMD instruction sets.
+LIBYUV_API
+int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
+
+// Returns LIBYUV_TRUE when environment variable |name| is set and its value
+// does not start with '0'.  Used to disable CPU features for testing; a
+// leading zero is ignored to make it easy to set the variable on/off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
+  const char* value = getenv(name);
+  if (value == NULL || value[0] == '0') {
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+#else  // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+  return LIBYUV_FALSE;
+}
+#endif
+
+
+// Detect CPU features, cache them in cpu_info_ and return the flags.
+LIBYUV_API SAFEBUFFERS
+int InitCpuFlags(void) {
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);  // cpu_info0[0] = highest supported basic leaf.
+  CpuId(1, 0, cpu_info1);
+  if (cpu_info0[0] >= 7) {  // Unsupported leaves can return unrelated data.
+    CpuId(7, 0, cpu_info7);
+  }
+  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+              kCpuHasX86;
+#ifdef HAS_XGETBV
+  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
+      TestOsSaveYmm()) {  // Saves YMM.
+    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+  }
+#endif
+  // Environment variable overrides for testing.
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
+    cpu_info_ &= ~kCpuHasX86;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+    cpu_info_ &= ~kCpuHasSSE41;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+    cpu_info_ &= ~kCpuHasSSE42;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
+    cpu_info_ &= ~kCpuHasAVX;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+    cpu_info_ &= ~kCpuHasAVX2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+    cpu_info_ &= ~kCpuHasERMS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+    cpu_info_ &= ~kCpuHasFMA3;
+  }
+#elif defined(__mips__) && defined(__linux__)
+  cpu_info_ = MipsCpuCaps("dsp");  // Parses /proc/cpuinfo; kCpuHasMIPS_DSP.
+#if defined(__mips_dspr2)
+  cpu_info_ |= kCpuHasMIPS_DSPR2;
+#endif
+  cpu_info_ |= kCpuHasMIPS;
+  // Use TestEnv() like the other platforms so that a value of "0" is ignored.
+  if (TestEnv("LIBYUV_DISABLE_MIPS")) {
+    cpu_info_ &= ~kCpuHasMIPS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_MIPS_DSP")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSP;
+  }
+  if (TestEnv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+  }
+#elif defined(__arm__)
+// gcc -mfpu=neon defines __ARM_NEON__, which generates code that requires
+// Neon.  NaCL also requires Neon.  Only Linux can test /proc/cpuinfo instead.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info_ = kCpuHasNEON;
+#else
+  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info_ |= kCpuHasARM;
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
+    cpu_info_ &= ~kCpuHasNEON;
+  }
+#endif  // __arm__
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
+    cpu_info_ = 0;
+  }
+  return cpu_info_;
+}
+
+LIBYUV_API
+void MaskCpuFlags(int enable_flags) {
+  cpu_info_ = InitCpuFlags() & enable_flags;  // Re-detect, then keep enabled.
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 0000000..68b8f46
--- /dev/null
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,2287 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/planar_functions.h"
+
+#include <string.h> // for memset()
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#endif
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data: height rows of width uint8 values, src_y to dst_y.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows: when both strides equal width, copy as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_X86)  // A later match below overrides an earlier one.
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy plane, one row per iteration, using the CopyRow selected above.
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height) {
+  int y;
+  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  // Coalesce rows: when both strides equal width, copy as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_16_X86)  // A later match overrides an earlier one.
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_16_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_16_MIPS;
+  }
+#endif
+
+  // Copy plane, one row of width uint16 values per iteration.
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copy the Y, U and V planes of an I422 image.  Returns 0, or -1 on bad args.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  const int chroma_width = (width + 1) >> 1;  // U and V rows are half width.
+  const int bad_pointer = !src_y || !src_u || !src_v ||
+                          !dst_y || !dst_u || !dst_v;
+  if (bad_pointer || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, chroma_width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, chroma_width, height);
+  return 0;
+}
+
+// Copy the Y, U and V planes of an I444 image; all three are full width.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  const int bad_pointer = !src_y || !src_u || !src_v ||
+                          !dst_y || !dst_u || !dst_v;
+  if (bad_pointer || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copy the Y plane of an I400 image.  Returns 0, or -1 on invalid arguments.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (src_y == NULL || dst_y == NULL || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Convert I420 to I400 by copying only the Y plane; src_u/src_v are unused.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (src_y == NULL || dst_y == NULL || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip: read the source bottom-up.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror a plane of data, one row at a time via the selected MirrorRow.
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+                 uint8* dst_y, int dst_stride_y,
+                 int width, int height) {
+  int y;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image: start at the last source row.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)  // A later match overrides an earlier one.
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    MirrorRow = MirrorRow_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
+    MirrorRow = MirrorRow_SSE2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    MirrorRow = MirrorRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    MirrorRow = MirrorRow_AVX2;
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert YUY2 to I422.
+// For every row, runs the selected YUY2ToUV422Row and YUY2ToYRow routines on
+// the packed source row, writing the U/V and Y destination rows.
+// Always returns 0; pointers and dimensions are not validated here.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ // Default to the portable C row functions; SIMD variants may replace them.
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) =
+ YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ // When every stride equals its row size the rows are contiguous, so the
+ // image is handled as one row of width * height pixels with zero strides.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+ // The selections below run in sequence; a later satisfied check overrides
+ // an earlier one.
+#if defined(HAS_YUY2TOYROW_SSE2)
+ // Selection narrows from the "Any" wrapper, to the unaligned row (width is a
+ // multiple of 16), to the aligned rows when pointers/strides are aligned.
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ // The Y "Any" wrapper is used from width 8, the UV one only from width 16.
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert UYVY to I422.
+// For every row, runs the selected UYVYToUV422Row and UYVYToYRow routines on
+// the packed source row, writing the U/V and Y destination rows.
+// Always returns 0; pointers and dimensions are not validated here.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ // Default to the portable C row functions; SIMD variants may replace them.
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) =
+ UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ // When every stride equals its row size the rows are contiguous, so the
+ // image is handled as one row of width * height pixels with zero strides.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+ // The selections below run in sequence; a later satisfied check overrides
+ // an earlier one.
+#if defined(HAS_UYVYTOYROW_SSE2)
+ // Selection narrows from the "Any" wrapper, to the unaligned row (width is a
+ // multiple of 16), to the aligned rows when pointers/strides are aligned.
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUV422Row = UYVYToUV422Row_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ // The Y "Any" wrapper is used from width 8, the UV one only from width 16.
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Mirror I400 with optional flipping
+// Validates the arguments, then hands the single plane to MirrorPlane.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  const uint8* first_row = src_y;
+  int row_step = src_stride_y;
+  int rows = height;
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip: start at the last source row
+  // and step backwards through the source.
+  if (rows < 0) {
+    rows = -rows;
+    first_row = src_y + (rows - 1) * src_stride_y;
+    row_step = -src_stride_y;
+  }
+
+  MirrorPlane(first_row, row_step, dst_y, dst_stride_y, width, rows);
+  return 0;
+}
+
+// Mirror I420 with optional flipping
+// Runs MirrorPlane on the Y plane at full size and on the U and V planes at
+// half size (rounded up). Returns 0 on success, -1 when any pointer is NULL,
+// width <= 0 or height == 0.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  const int chroma_cols = (width + 1) >> 1;
+  int rows = height;
+  int chroma_rows;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height requests a vertical flip of all three source planes.
+  if (rows < 0) {
+    rows = -rows;
+    chroma_rows = (rows + 1) >> 1;
+    src_y += (rows - 1) * src_stride_y;
+    src_u += (chroma_rows - 1) * src_stride_u;
+    src_v += (chroma_rows - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  } else {
+    chroma_rows = (rows + 1) >> 1;
+  }
+
+  // dst_y was already verified non-NULL above, so the luma plane is always
+  // mirrored.
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, rows);
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u,
+              chroma_cols, chroma_rows);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v,
+              chroma_cols, chroma_rows);
+  return 0;
+}
+
+// ARGB mirror.
+// Calls the selected ARGBMirrorRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ // The checks below run in sequence, so when several are compiled in and
+ // satisfied, the last matching assignment wins.
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ // The SSSE3 row is only chosen when pointers and strides are 16-byte aligned.
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Get a blender that is optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+// Returns the SSSE3 row when available; otherwise the C row, replaced by the
+// SSE2 and then the NEON row when those are compiled in and supported.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*blend_row)(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  // SSSE3 takes priority over every other variant, so return it immediately.
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    return ARGBBlendRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    blend_row = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    blend_row = ARGBBlendRow_NEON;
+  }
+#endif
+  return blend_row;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
+// Uses the row function returned by GetARGBBlend() for every row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int row;
+  void (*blend_row)(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width) = GetARGBBlend();
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // A negative height flips the output: write the destination bottom-up.
+  if (height < 0) {
+    height = -height;
+    dst_argb += (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // When all three images are tightly packed, treat them as one long row.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = 0;
+    src_stride_argb1 = 0;
+    dst_stride_argb = 0;
+  }
+
+  for (row = 0; row < height; ++row) {
+    blend_row(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+// Calls the selected ARGBMultiplyRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBMultiplyRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // Here the flip is applied to the destination: output is written bottom-up.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ // When all three images are tightly packed, treat them as one long row.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+ // Each block picks the "Any" wrapper above a minimum width and the plain
+ // SIMD row when width is a multiple of the step; later blocks override
+ // earlier ones.
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+ }
+ }
+#endif
+
+ // Multiply plane
+ for (y = 0; y < height; ++y) {
+ ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+// Calls the selected ARGBAddRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBAddRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // Here the flip is applied to the destination: output is written bottom-up.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ // When all three images are tightly packed, treat them as one long row.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+ // NOTE(review): under _MSC_VER the SSE2 row is used with no width
+ // restriction; presumably that build's row handles any width - confirm.
+#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+#endif
+ // Other compilers use the "Any" wrapper unless width is a multiple of 4.
+#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBAddRow = ARGBAddRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBAddRow = ARGBAddRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBAddRow = ARGBAddRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_NEON;
+ }
+ }
+#endif
+
+ // Add plane
+ for (y = 0; y < height; ++y) {
+ ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+// Calls the selected ARGBSubtractRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBSubtractRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // Here the flip is applied to the destination: output is written bottom-up.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ // When all three images are tightly packed, treat them as one long row.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+ // Each block picks the "Any" wrapper above a minimum width and the plain
+ // SIMD row when width is a multiple of the step; later blocks override
+ // earlier ones.
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_NEON;
+ }
+ }
+#endif
+
+ // Subtract plane
+ for (y = 0; y < height; ++y) {
+ ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert I422 to BGRA.
+// Calls the selected I422ToBGRARow routine once per row.
+// Returns 0 on success, -1 when any pointer is NULL, width <= 0 or
+// height == 0. A negative height writes the destination bottom-up.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  // Start with the portable C row function; one SIMD variant may replace it.
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+    dst_stride_bgra = -dst_stride_bgra;
+  }
+  // Coalesce rows: tightly packed planes are processed as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_bgra == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
+  }
+#if defined(HAS_I422TOBGRAROW_NEON)
+  // Require at least one full 16-pixel group before choosing the NEON "Any"
+  // wrapper, matching the minimum-width guards used for the other "Any"
+  // selections in this file; narrower images stay on the C row function.
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToBGRARow = I422ToBGRARow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_NEON;
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+        I422ToBGRARow = I422ToBGRARow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+    dst_bgra += dst_stride_bgra;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to ABGR.
+// Calls the selected I422ToABGRRow routine once per row.
+// Returns 0 on success, -1 when any pointer is NULL, width <= 0 or
+// height == 0. A negative height writes the destination bottom-up.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int y;
+  // Start with the portable C row function; one SIMD variant may replace it.
+  void (*I422ToABGRRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToABGRRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_abgr ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+    dst_stride_abgr = -dst_stride_abgr;
+  }
+  // Coalesce rows: tightly packed planes are processed as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
+  }
+#if defined(HAS_I422TOABGRROW_NEON)
+  // Require at least one full 16-pixel group before choosing the NEON "Any"
+  // wrapper, matching the minimum-width guards used for the other "Any"
+  // selections in this file; narrower images stay on the C row function.
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToABGRRow = I422ToABGRRow_NEON;
+    }
+  }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+        I422ToABGRRow = I422ToABGRRow_SSSE3;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+    dst_abgr += dst_stride_abgr;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to RGBA.
+// Calls the selected I422ToRGBARow routine once per row.
+// Returns 0 on success, -1 when any pointer is NULL, width <= 0 or
+// height == 0. A negative height writes the destination bottom-up.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  int y;
+  // Start with the portable C row function; one SIMD variant may replace it.
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_rgba ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+  // Coalesce rows: tightly packed planes are processed as one long row.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_rgba == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
+  }
+#if defined(HAS_I422TORGBAROW_NEON)
+  // Require at least one full 16-pixel group before choosing the NEON "Any"
+  // wrapper, matching the minimum-width guards used for the other "Any"
+  // selections in this file; narrower images stay on the C row function.
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+        I422ToRGBARow = I422ToRGBARow_SSSE3;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+// Calls the selected NV12ToRGB565Row routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; one SIMD variant may replace it.
+ void (*NV12ToRGB565Row)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // The flip is applied to the destination: output is written bottom-up.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ // SSSE3 and NEON are alternatives (#if/#elif): at most one is compiled in.
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ // Advance the UV row only after odd rows, so two consecutive Y rows share
+ // each UV row.
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB565.
+// Calls the selected NV21ToRGB565Row routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; one SIMD variant may replace it.
+ void (*NV21ToRGB565Row)(const uint8* y_buf,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) = NV21ToRGB565Row_C;
+ if (!src_y || !src_vu || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ // The flip is applied to the destination: output is written bottom-up.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ // SSSE3 and NEON are alternatives (#if/#elif): at most one is compiled in.
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_NV21TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ // Advance the VU row only after odd rows, so two consecutive Y rows share
+ // each VU row.
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Fill a width x height region of a plane using the selected SetRow routine.
+// No argument validation is done here.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value) {
+  int rows_left;
+  // OR the value with copies of itself shifted by 8, 16 and 24 bits; for a
+  // value in 0..255 this places the byte in all four byte lanes.
+  const uint32 fill32 =
+      value | (value << 8) | (value << 16) | (value << 24);
+  void (*fill_row)(uint8* dst, uint32 value, int pix) = SetRow_C;
+  // A tightly packed plane is filled as a single long row.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    fill_row = SetRow_NEON;
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    fill_row = SetRow_X86;
+  }
+#endif
+
+  // Fill one row at a time.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    fill_row(dst_y, fill32, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draw a rectangle into I420
+// Fills the rectangle at (x, y) of size width x height with value_y in the Y
+// plane, and the half-size (rounded up) rectangle at (x / 2, y / 2) with
+// value_u / value_v in the U / V planes.
+// Returns 0 on success, -1 when a pointer is NULL, width or height is not
+// positive, x or y is negative, or any value is outside 0..255.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  uint8* start_y;
+  uint8* start_u;
+  uint8* start_v;
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height <= 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+  // Compute the start addresses only after validation, so no pointer
+  // arithmetic is performed on NULL pointers or with negative offsets.
+  start_y = dst_y + y * dst_stride_y + x;
+  start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+  SetPlane(start_y, dst_stride_y, width, height, value_y);
+  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
+
+// Draw a rectangle into ARGB
+// Fills the width x height rectangle at (dst_x, dst_y) with value using one
+// of the ARGBSetRows routines.
+// Returns 0 on success, -1 when dst_argb is NULL, width or height is not
+// positive, or dst_x or dst_y is negative.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height,
+             uint32 value) {
+  uint8* rect;
+  if (!dst_argb ||
+      width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Address of the rectangle's top-left pixel (4 bytes per pixel).
+  rect = dst_argb + (dst_y * dst_stride_argb + dst_x * 4);
+  // A rectangle spanning full tightly packed rows is filled as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(rect, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBSetRows_NEON(rect, value, width, dst_stride_argb, height);
+    return 0;
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRows_X86(rect, value, width, dst_stride_argb, height);
+    return 0;
+  }
+#endif
+  // Portable fallback.
+  ARGBSetRows_C(rect, value, width, dst_stride_argb, height);
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
+//
+// Calls the selected ARGBAttenuateRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image (source is read bottom-up).
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ // Tightly packed source and destination are processed as one long row.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+ // The blocks below run in sequence; a later satisfied check overrides an
+ // earlier one.
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+ // The SSE2 rows are only chosen when pointers and strides are 16-byte
+ // aligned.
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+// Calls the selected ARGBUnattenuateRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; SIMD variants may replace it below.
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image (source is read bottom-up).
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ // Tightly packed source and destination are processed as one long row.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+ // AVX2, when compiled in and satisfied, overrides the SSE2 choice.
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Neon version.
+
+ for (y = 0; y < height; ++y) {
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB to Grayed ARGB.
+// Calls the selected ARGBGrayRow routine once per row, reading src_argb and
+// writing dst_argb.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ // Start with the portable C row function; one SIMD variant may replace it.
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image (source is read bottom-up).
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ // Tightly packed source and destination are processed as one long row.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ // The SSSE3 row is only chosen when both source and destination pointers
+ // and strides are 16-byte aligned and width is a multiple of 8.
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+// Applies the selected ARGBGrayRow routine in place to each row of the
+// width x height rectangle at (dst_x, dst_y).
+// Returns 0 on success, -1 when dst_argb is NULL, width or height is not
+// positive, or dst_x or dst_y is negative.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Compute the rectangle origin only after validation, so no pointer
+  // arithmetic is performed on a NULL pointer or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  // The alignment test must use dst, the pointer actually handed to the row
+  // function; the image base dst_argb may be aligned while the rectangle
+  // origin is not.
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBGrayRow = ARGBGrayRow_SSSE3;
+  }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBGrayRow(dst, dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Make a rectangle of ARGB Sepia tone.
+// Applies the selected ARGBSepiaRow routine in place to each row of the
+// width x height rectangle at (dst_x, dst_y).
+// Returns 0 on success, -1 when dst_argb is NULL, width or height is not
+// positive, or dst_x or dst_y is negative.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  uint8* dst;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Compute the rectangle origin only after validation, so no pointer
+  // arithmetic is performed on a NULL pointer or with negative offsets.
+  dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  // The alignment test must use dst, the pointer actually handed to the row
+  // function; the image base dst_argb may be aligned while the rectangle
+  // origin is not.
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#elif defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+// Calls the selected ARGBColorMatrixRow routine once per row.
+// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or height == 0.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image (source is read bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  // Require 16-byte alignment of the source as well as the destination, as
+  // ARGBGrayTo does for its SSSE3 row; the row function reads src_argb and
+  // writes dst_argb, so both buffers must satisfy the constraint.
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+  }
+#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel of a rectangle, in place.
+// Deprecated: widens the matrix to 4x4 and forwards to ARGBColorMatrix.
+// matrix_rgb supplies 12 signed coefficients; the fourth row is synthesized.
+// dst_x, dst_y, width and height select the rectangle within dst_argb.
+// Returns -1 on invalid arguments, otherwise the ARGBColorMatrix result.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  int i;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  // Halving each supplied coefficient drops one bit of fixed point precision.
+  for (i = 0; i < 12; ++i) {
+    matrix_argb[i] = matrix_rgb[i] / 2;
+  }
+  // Synthesized fourth row: 0, 0, 0, 1.0.
+  matrix_argb[12] = 0;
+  matrix_argb[13] = 0;
+  matrix_argb[14] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  // The rectangle is both source and destination of the 4x4 transform.
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel of a rectangle, in place.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  void (*TableRow)(uint8* dst_argb, const uint8* table_argb,
+                   int width) = ARGBColorTableRow_C;
+  uint8* row = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  int rows_left;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // A tightly packed rectangle is handled as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    TableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    TableRow(row, table_argb, width);
+    row += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  void (*TableRow)(uint8* dst_argb, const uint8* table_argb,
+                   int width) = RGBColorTableRow_C;
+  uint8* row = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  int rows_left;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // A tightly packed rectangle is handled as one long row.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    TableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    TableRow(row, table_argb, width);
+    row += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art; it works in place on a rectangle.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low levels implement efficiently with 3 parameters, and could be
+// used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a multiply by the fixed point reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  // Check alignment of dst (first pixel written), not of the image origin.
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y, dst += dst_stride_argb) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+  }
+  return 0;
+}
+
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height) {
+  void (*SumRow)(const uint8* row, int32* cumsum, const int32* previous_cumsum,
+                 int width) = ComputeCumulativeSumRow_C;
+  int32* above = dst_cumsum;  // Row 0 uses itself (zeroed) as the row above.
+  int y;
+  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SumRow = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 int per pixel.
+  for (y = 0; y < height; ++y) {
+    SumRow(src_argb, dst_cumsum, above, width);
+    above = dst_cumsum;
+    dst_cumsum += dst_stride32_cumsum;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Blur ARGB image with a box of the given radius. Returns -1 on bad arguments.
+// Caller should allocate CumulativeSum table of width * height * 16 bytes
+// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
+// as the buffer is treated as circular.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+  int32* cumsum_bot_row;
+  int32* max_cumsum_bot_row;
+  int32* cumsum_top_row;
+
+  if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {  // Negative height means invert the image.
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (radius > height) {
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {  // Also rejects images too narrow for any radius.
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+  }
+#endif
+  // Compute enough CumulativeSum for first row to be blurred. After this
+  // one row of CumulativeSum is updated at a time.
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+                           dst_cumsum, dst_stride32_cumsum,
+                           width, radius);
+
+  src_argb = src_argb + radius * src_stride_argb;
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
+
+  for (y = 0; y < height; ++y) {
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;
+    int x;
+    int n;
+
+    // Increment cumsum_top_row pointer with circular buffer wrap around.
+    if (top_y) {
+      cumsum_top_row += dst_stride32_cumsum;
+      if (cumsum_top_row >= max_cumsum_bot_row) {
+        cumsum_top_row = dst_cumsum;
+      }
+    }
+    // Increment cumsum_bot_row pointer with circular buffer wrap around and
+    // then fill in a row of CumulativeSum.
+    if ((y + radius) < height) {
+      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      cumsum_bot_row += dst_stride32_cumsum;
+      if (cumsum_bot_row >= max_cumsum_bot_row) {
+        cumsum_bot_row = dst_cumsum;
+      }
+      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+                              width);
+      src_argb += src_stride_argb;
+    }
+
+    // Left clipped.
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped.
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], n);
+
+    // Right clipped.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+// A negative height reads the source bottom-up; a value of 0 is rejected.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value) {
+  void (*ShadeRow)(const uint8* src_argb, uint8* dst_argb,
+                   int width, uint32 value) = ARGBShadeRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images tightly packed: process everything as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ShadeRow = ARGBShadeRow_SSE2;
+  }
+#elif defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ShadeRow = ARGBShadeRow_NEON;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    ShadeRow(src_argb, dst_argb, width, value);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation) {
+  int y;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  // src_argb1 is passed as a byte offset from src_argb0 (the stride argument).
+  for (y = 0; y < height; ++y) {
+    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+                   width * 4, interpolation);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is passed to the row function unchanged and must not be NULL.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  int y;
+  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb || !shuffler ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Coalesce rows.
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_bgra = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBShuffleRow = ARGBShuffleRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Sobel ARGB effect. Shared driver: SobelRow turns each row's X/Y into output.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ void (*SobelRow)(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst, int width)) {
+ int y;
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerGGRow_C;
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image (source is read bottom-up).
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // ARGBToBayer used to select G channel from ARGB.
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerGGRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ {
+ // One buffer: sobelx row, sobely row, then 3 Y rows with edges before/after.
+ const int kRowSize = (width + kEdge + 15) & ~15;
+ align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ uint8* row_sobelx = rows;
+ uint8* row_sobely = rows + kRowSize;
+ uint8* row_y = rows + kRowSize * 2;
+
+ // Convert first row into both y0 and y1, so the top edge repeats row 0.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kRowSize;
+ uint8* row_y2 = row_y1 + kRowSize;
+ ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+ row_y0[-1] = row_y0[0];
+ memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
+ ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+ row_y1[-1] = row_y1[0];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
+
+ for (y = 0; y < height; ++y) {
+ // Convert next row of ARGB to Y; the last row is reused at the bottom edge.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+ row_y2[-1] = row_y2[0];  // Replicate the left edge pixel.
+ row_y2[width] = row_y2[width - 1];  // Replicate the right edge pixel.
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ {
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+ }
+
+ dst_argb += dst_stride_argb;
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+// Selects the Sobel row function for this CPU, then runs ARGBSobelize.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb, int width, int height) {
+  void (*RowFunc)(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RowFunc = SobelRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    RowFunc = SobelRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, RowFunc);
+}
+
+// Sobel ARGB effect with planar output.
+// Selects the plane-writing row function, then runs ARGBSobelize.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y, int width, int height) {
+  void (*RowFunc)(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    RowFunc = SobelToPlaneRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    RowFunc = SobelToPlaneRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                      width, height, RowFunc);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+// Selects the XY row function for this CPU, then runs ARGBSobelize.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb, int width, int height) {
+  void (*RowFunc)(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RowFunc = SobelXYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    RowFunc = SobelXYRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, RowFunc);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+// poly is handed unchanged to the row function; NULL is rejected.
+// A negative height reads the source bottom-up.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly, int width, int height) {
+  void (*PolynomialRow)(const uint8* src_argb,
+                        uint8* dst_argb, const float* poly,
+                        int width) = ARGBPolynomialRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images tightly packed: process everything as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+    PolynomialRow = ARGBPolynomialRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+      IS_ALIGNED(width, 2)) {
+    PolynomialRow = ARGBPolynomialRow_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    PolynomialRow(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a lumacolortable to each ARGB pixel.
+// A negative height reads the source bottom-up.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma, int width, int height) {
+  void (*LumaRow)(const uint8* src_argb, uint8* dst_argb,
+                  int width, const uint8* luma, const uint32 lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images tightly packed: process everything as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+    LumaRow = ARGBLumaColorTableRow_SSSE3;
+  }
+#endif
+
+  // 0x00264b0f is the fixed lumacoeff argument handed to every row call.
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    LumaRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+// A negative height reads the source bottom-up.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  void (*CopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  int rows_left;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Both images tightly packed: process everything as one long row.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
+    CopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    CopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    CopyAlphaRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+// A negative height reads the Y plane bottom-up.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height) {
+  void (*CopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      ARGBCopyYToAlphaRow_C;
+  int rows_left;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Both planes tightly packed: process everything as one long row.
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
+    CopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    CopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+  }
+#endif
+
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    CopyYToAlphaRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h
deleted file mode 100644
index eabe180..0000000
--- a/third_party/libyuv/source/row.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_ROW_H_
-#define LIBYUV_SOURCE_ROW_H_
-
-#include "third_party/libyuv/include/libyuv/basic_types.h"
-
-#define kMaxStride (2048 * 4)
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
-#define YUV_DISABLE_ASM
-#endif
-
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_NEON
-void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
-void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-#define HAS_FASTCONVERTYUVTOABGRROW_NEON
-void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-#endif
-
-// The following are available on all x86 platforms
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
- !defined(YUV_DISABLE_ASM)
-#define HAS_ABGRTOARGBROW_SSSE3
-#define HAS_BGRATOARGBROW_SSSE3
-#define HAS_BG24TOARGBROW_SSSE3
-#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RGB24TOYROW_SSSE3
-#define HAS_RAWTOYROW_SSSE3
-#define HAS_RGB24TOUVROW_SSSE3
-#define HAS_RAWTOUVROW_SSSE3
-#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOYROW_SSSE3
-#define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_ABGRTOUVROW_SSSE3
-#define HAS_I400TOARGBROW_SSE2
-#define HAS_FASTCONVERTYTOARGBROW_SSE2
-#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
-#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
-#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
-#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
-#define HAS_REVERSE_ROW_SSSE3
-#endif
-
-// The following are available on Neon platforms
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_REVERSE_ROW_NEON
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_SSSE3
-void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_NEON
-void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
-#endif
-void ReverseRow_C(const uint8* src, uint8* dst, int width);
-
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-#endif
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-#endif
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-typedef __declspec(align(16)) signed char vec8[16];
-typedef __declspec(align(16)) unsigned char uvec8[16];
-typedef __declspec(align(16)) signed short vec16[8];
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-typedef signed char __attribute__((vector_size(16))) vec8;
-typedef unsigned char __attribute__((vector_size(16))) uvec8;
-typedef signed short __attribute__((vector_size(16))) vec16;
-#endif
-
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-
-void FastConvertYUVToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToBGRARow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToABGRRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYToARGBRow_C(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
-
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
-void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
-#endif
-
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
-void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
-
-#endif
-
-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
-
-#endif
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // LIBYUV_SOURCE_ROW_H_
diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
new file mode 100644
index 0000000..27a0de1
--- /dev/null
+++ b/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,542 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
+// TODO(fbarchard): Consider 'any' functions handling odd alignment.
+// YUV to RGB does multiple of 8 with SIMD and remainder with C.
+#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* u_buf, \
+ const uint8* v_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~MASK; \
+ I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ I420TORGB_C(y_buf + n, \
+ u_buf + (n >> UV_SHIFT), \
+ v_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * BPP, width & MASK); \
+ }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+ 0, 4, 7)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+ 1, 4, 7)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+ 2, 4, 7)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+ 1, 4, 7)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+ 1, 4, 7)
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+ 1, 4, 7)
+// I422ToRGB565Row_SSSE3 is unaligned.
+YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
+ 1, 2, 7)
+YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
+ 1, 2, 7)
+YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
+ 1, 2, 7)
+// I422ToRGB24Row_SSSE3 is unaligned.
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_AVX2
+YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
+#endif // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_I422TOARGBROW_NEON
+YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
+YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
+YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
+ 1, 2, 7)
+YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
+ 1, 2, 7)
+YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOARGBROW_NEON
+#undef YANY
+
+// Any-width wrappers: SIMD handles width & ~7 pixels, C the last width & 7.
+#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
+    void NAMEANY(const uint8* y_buf, \
+                 const uint8* uv_buf, \
+                 uint8* rgb_buf, \
+                 int width) { \
+      int n = width & ~7; \
+      NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+      NV12TORGB_C(y_buf + n, \
+                  uv_buf + (n >> UV_SHIFT), \
+                  rgb_buf + n * BPP, width & 7); \
+    }
+
+#ifdef HAS_NV12TOARGBROW_SSSE3  // BPP 4: destination advances 4 bytes per pixel.
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
+      0, 4)
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
+      0, 4)
+#endif  // HAS_NV12TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_NEON
+NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
+NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
+#endif  // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_SSSE3  // BPP 2: destination advances 2 bytes per pixel.
+NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
+      0, 2)
+NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
+      0, 2)
+#endif  // HAS_NV12TORGB565ROW_SSSE3
+#ifdef HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
+#endif  // HAS_NV12TORGB565ROW_NEON
+#undef NV2NY
+
+#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
+ void NAMEANY(const uint8* src, \
+ uint8* dst, \
+ int width) { \
+ int n = width & ~MASK; \
+ ARGBTORGB_SIMD(src, dst, n); \
+ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
+ }
+
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
+ 15, 4, 3)
+RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
+ 15, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
+ 3, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
+ 3, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
+ 3, 4, 2)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
+ 7, 1, 4)
+#endif
+#if defined(HAS_YTOARGBROW_SSE2)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
+ 15, 2, 4)
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
+ 15, 2, 4)
+// These require alignment on ARGB, so C is used for remainder.
+RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
+ 15, 3, 4)
+RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
+ 15, 3, 4)
+RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
+ 7, 2, 4)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
+ 7, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
+ 7, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
+ 7, 4, 2)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
+ 7, 2, 4)
+#endif
+#undef RGBANY
+
+// ARGB to Bayer does multiple of 8 pixels, SSSE3 aligned src, unaligned dst.
+#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
+ void NAMEANY(const uint8* src, \
+ uint8* dst, uint32 selector, \
+ int width) { \
+ int n = width & ~MASK; \
+ ARGBTORGB_SIMD(src, dst, selector, n); \
+ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
+ }
+
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
+ 7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
+BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
+ 7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
+ 7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
+ 7, 4, 1)
+#endif
+
+#undef BAYERANY
+
+// RGB/YUV to Y does the first width - NUM with SIMD and last NUM with SIMD.
+#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
+ ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
+ dst_y + (width - NUM) * BPP, NUM); \
+ }
+
+#ifdef HAS_ARGBTOYROW_AVX2
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
+YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
+YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
+YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
+YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
+#endif
+#undef YANY
+
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ ARGBTOY_SIMD(src_argb, dst_y, n); \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_y + n * BPP, width & MASK); \
+ }
+
+// Attenuate is destructive so last16 method can not be used due to overlap.
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
+ 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
+ 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
+ 4, 4, 7)
+#endif
+#undef YANY
+
+// RGB/YUV to UV does multiple of 16 (32 for AVX2) with SIMD, remainder with C.
+#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
+UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
+ 4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
+#endif
+#undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
+ void NAMEANY(const uint8* src_uv, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ ANYTOUV_C(src_uv + n * BPP, \
+ dst_u + (n >> SHIFT), \
+ dst_v + (n >> SHIFT), \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
+ ARGBToUV444Row_C, 4, 15, 0)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
+ YUY2ToUV422Row_C, 2, 31, 1)
+UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
+ UYVYToUV422Row_C, 2, 31, 1)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
+ ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
+ YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
+ UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
+ ARGBToUV444Row_C, 4, 7, 0)
+UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
+ ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
+ ARGBToUV411Row_C, 4, 31, 2)
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
+ YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
+ UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#undef UV422ANY
+
+#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+ void NAMEANY(const uint8* src_uv, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ ANYTOUV_C(src_uv + n * 2, \
+ dst_u + n, \
+ dst_v + n, \
+ width & MASK); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+ SplitUVRow_C, 15)
+#endif
+#undef SPLITUVROWANY
+
+#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+ void NAMEANY(const uint8* src_u, const uint8* src_v, \
+ uint8* dst_uv, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ ANYTOUV_C(src_u + n, \
+ src_v + n, \
+ dst_uv + n * 2, \
+ width & MASK); \
+ }
+
+#ifdef HAS_MERGEUVROW_SSE2
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+#endif
+#undef MERGEUVROW_ANY
+
+#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \
+ void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
+ uint8* dst_argb, int width) { \
+ int n = width & ~MASK; \
+ ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ ARGBMATH_C(src_argb0 + n * 4, \
+ src_argb1 + n * 4, \
+ dst_argb + n * 4, \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
+ 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
+ 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
+ 7)
+#endif
+#undef MATHROW_ANY
+
+// Shuffle may want to work in place, so last16 method can not be used.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
+ const uint8* shuffler, int width) { \
+ int n = width & ~MASK; \
+ ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_argb + n * BPP, shuffler, width & MASK); \
+ }
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
+ ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+ ARGBShuffleRow_C, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
+ ARGBShuffleRow_C, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
+ ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#undef YANY
+
+// Interpolate may want to work in place, so last16 method can not be used.
+#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+                 ptrdiff_t src_stride_ptr, int width, \
+                 int source_y_fraction) { \
+      int n = width & ~MASK; \
+      TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
+                n, source_y_fraction); \
+      TERP_C(dst_ptr + n * BPP, \
+             src_ptr + n * SBPP, src_stride_ptr, \
+             width & MASK, source_y_fraction); \
+    }
+
+#ifdef HAS_INTERPOLATEROW_AVX2  // MASK must be 2^k - 1 so n is a whole SIMD count.
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
+     InterpolateRow_C, 1, 1, 31)  // 32 - 1: SIMD gets a multiple of 32, C the rest.
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSE2
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
+     InterpolateRow_C, 1, 1, 3)
+#endif
+#undef NANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc
new file mode 100644
index 0000000..ceb3836
--- /dev/null
+++ b/third_party/libyuv/source/row_common.cc
@@ -0,0 +1,2286 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+// Both variants below define the same four helpers; USE_BRANCHLESS selects
+// the bit-trick implementation.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+// Returns v when v is positive, otherwise 0. "-(v) >> 31" is used as an
+// all-ones mask for positive v (this relies on the right shift of a negative
+// value being arithmetic).
+static __inline int32 clamp0(int32 v) {
+ return ((-(v) >> 31) & (v));
+}
+
+// Returns 255 when v is above 255, otherwise the low 8 bits of v. Negative
+// inputs are not clamped here; Clamp() applies clamp0() first.
+static __inline int32 clamp255(int32 v) {
+ return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+// Clamps val to the range 0..255.
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+// Absolute value: m is 0 for non-negative v and -1 for negative v, so
+// (v + m) ^ m negates v only when it is negative.
+static __inline uint32 Abs(int32 v) {
+ int m = v >> 31;
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+// Returns v, or 0 when v is negative.
+static __inline int32 clamp0(int32 v) {
+ return (v < 0) ? 0 : v;
+}
+
+// Returns v, or 255 when v is above 255.
+static __inline int32 clamp255(int32 v) {
+ return (v > 255) ? 255 : v;
+}
+
+// Clamps val to the range 0..255.
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+// Absolute value of v.
+static __inline uint32 Abs(int32 v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
+
+// WRITEWORD(p, v) stores the 32-bit value v at p with the least significant
+// byte first.
+#ifdef LIBYUV_LITTLE_ENDIAN
+// Native store. Arguments and the whole expansion are parenthesized so that
+// any expression can be passed for v.
+#define WRITEWORD(p, v) (*(uint32*)(p) = (uint32)(v))
+#else
+// Byte-by-byte store, so the byte order in memory does not depend on the
+// host byte order.
+static __inline void WRITEWORD(uint8* p, uint32 v) {
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+// Expands a row of 3-byte RGB24 pixels (bytes B, G, R) into 4-byte ARGB
+// pixels (bytes B, G, R, A) with alpha set to 255.
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8* in = src_rgb24 + i * 3;
+    uint8* out = dst_argb + i * 4;
+    // Load the whole source pixel before storing anything.
+    const uint8 blue = in[0];
+    const uint8 green = in[1];
+    const uint8 red = in[2];
+    out[0] = blue;
+    out[1] = green;
+    out[2] = red;
+    out[3] = 255u;
+  }
+}
+
+// Expands a row of 3-byte RAW pixels (bytes R, G, B) into 4-byte ARGB
+// pixels (bytes B, G, R, A) with alpha set to 255.
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8* in = src_raw + i * 3;
+    uint8* out = dst_argb + i * 4;
+    // Load the whole source pixel before storing anything.
+    const uint8 red = in[0];
+    const uint8 green = in[1];
+    const uint8 blue = in[2];
+    out[0] = blue;
+    out[1] = green;
+    out[2] = red;
+    out[3] = 255u;
+  }
+}
+
+// Expands a row of 16-bit RGB565 pixels (low byte first) into ARGB with
+// alpha set to 255. Each 5/6-bit channel is widened to 8 bits by repeating
+// its top bits in the low bits.
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Rebuild the 16-bit pixel from its low and high bytes.
+    const int pixel = src_rgb565[2 * i] | (src_rgb565[2 * i + 1] << 8);
+    const int b5 = pixel & 0x1f;
+    const int g6 = (pixel >> 5) & 0x3f;
+    const int r5 = pixel >> 11;
+    uint8* out = dst_argb + 4 * i;
+    out[0] = (uint8)((b5 << 3) | (b5 >> 2));
+    out[1] = (uint8)((g6 << 2) | (g6 >> 4));
+    out[2] = (uint8)((r5 << 3) | (r5 >> 2));
+    out[3] = 255u;
+  }
+}
+
+// Expands a row of 16-bit ARGB1555 pixels (low byte first) into ARGB.
+// The 5-bit color channels are widened to 8 bits by bit replication and the
+// single alpha bit becomes 0 or 255.
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Rebuild the 16-bit pixel from its low and high bytes.
+    const int pixel = src_argb1555[2 * i] | (src_argb1555[2 * i + 1] << 8);
+    const int b5 = pixel & 0x1f;
+    const int g5 = (pixel >> 5) & 0x1f;
+    const int r5 = (pixel >> 10) & 0x1f;
+    const int a1 = pixel >> 15;
+    uint8* out = dst_argb + 4 * i;
+    out[0] = (uint8)((b5 << 3) | (b5 >> 2));
+    out[1] = (uint8)((g5 << 3) | (g5 >> 2));
+    out[2] = (uint8)((r5 << 3) | (r5 >> 2));
+    out[3] = a1 ? 255u : 0u;
+  }
+}
+
+// Expands a row of 16-bit ARGB4444 pixels into ARGB. Byte 0 holds blue in
+// its low nibble and green in its high nibble; byte 1 holds red and alpha
+// the same way. Each nibble n becomes the byte (n << 4) | n, i.e. n * 0x11.
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int lo = src_argb4444[2 * i];
+    const int hi = src_argb4444[2 * i + 1];
+    uint8* out = dst_argb + 4 * i;
+    out[0] = (uint8)((lo & 0x0f) * 0x11);
+    out[1] = (uint8)((lo >> 4) * 0x11);
+    out[2] = (uint8)((hi & 0x0f) * 0x11);
+    out[3] = (uint8)((hi >> 4) * 0x11);
+  }
+}
+
+// Packs a row of ARGB pixels (bytes B, G, R, A) into 3-byte RGB24 pixels
+// (bytes B, G, R); the alpha byte is dropped.
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8* in = src_argb + i * 4;
+    uint8* out = dst_rgb + i * 3;
+    // Load the color channels before storing anything.
+    const uint8 blue = in[0];
+    const uint8 green = in[1];
+    const uint8 red = in[2];
+    out[0] = blue;
+    out[1] = green;
+    out[2] = red;
+  }
+}
+
+// Packs a row of ARGB pixels (bytes B, G, R, A) into 3-byte RAW pixels
+// (bytes R, G, B); the alpha byte is dropped.
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8* in = src_argb + i * 4;
+    uint8* out = dst_rgb + i * 3;
+    // Load the color channels before storing anything.
+    const uint8 blue = in[0];
+    const uint8 green = in[1];
+    const uint8 red = in[2];
+    out[0] = red;
+    out[1] = green;
+    out[2] = blue;
+  }
+}
+
+// Packs a row of ARGB pixels into 16-bit RGB565 pixels (blue in bits 0-4,
+// green in bits 5-10, red in bits 11-15), stored low byte first.
+// Pixels are processed in pairs; an odd trailing pixel is handled separately.
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    // Unsigned 32-bit channels so the shifts up to bit 31 cannot overflow a
+    // signed int.
+    const uint32 b0 = src_argb[0] >> 3;
+    const uint32 g0 = src_argb[1] >> 2;
+    const uint32 r0 = src_argb[2] >> 3;
+    const uint32 b1 = src_argb[4] >> 3;
+    const uint32 g1 = src_argb[5] >> 2;
+    const uint32 r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    const uint32 b0 = src_argb[0] >> 3;
+    const uint32 g0 = src_argb[1] >> 2;
+    const uint32 r0 = src_argb[2] >> 3;
+    const uint32 pixel = b0 | (g0 << 5) | (r0 << 11);
+    // Store low byte first, matching the byte order WRITEWORD uses for pairs.
+    dst_rgb[0] = (uint8)(pixel & 255);
+    dst_rgb[1] = (uint8)(pixel >> 8);
+  }
+}
+
+// Packs a row of ARGB pixels into 16-bit ARGB1555 pixels (blue in bits 0-4,
+// green in bits 5-9, red in bits 10-14, alpha in bit 15), stored low byte
+// first. Pixels are processed in pairs; an odd trailing pixel is handled
+// separately.
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    // Unsigned 32-bit channels: "a1 << 31" on a promoted int would shift
+    // into the sign bit.
+    const uint32 b0 = src_argb[0] >> 3;
+    const uint32 g0 = src_argb[1] >> 3;
+    const uint32 r0 = src_argb[2] >> 3;
+    const uint32 a0 = src_argb[3] >> 7;
+    const uint32 b1 = src_argb[4] >> 3;
+    const uint32 g1 = src_argb[5] >> 3;
+    const uint32 r1 = src_argb[6] >> 3;
+    const uint32 a1 = src_argb[7] >> 7;
+    // WRITEWORD keeps the byte order consistent with ARGBToRGB565Row_C.
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+              (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    const uint32 b0 = src_argb[0] >> 3;
+    const uint32 g0 = src_argb[1] >> 3;
+    const uint32 r0 = src_argb[2] >> 3;
+    const uint32 a0 = src_argb[3] >> 7;
+    const uint32 pixel = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+    // Store low byte first, matching the pair path.
+    dst_rgb[0] = (uint8)(pixel & 255);
+    dst_rgb[1] = (uint8)(pixel >> 8);
+  }
+}
+
+// Packs a row of ARGB pixels into 16-bit ARGB4444 pixels (blue in bits 0-3,
+// green in bits 4-7, red in bits 8-11, alpha in bits 12-15), stored low byte
+// first. Pixels are processed in pairs; an odd trailing pixel is handled
+// separately.
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    // Unsigned 32-bit channels: "a1 << 28" on a promoted int could overflow.
+    const uint32 b0 = src_argb[0] >> 4;
+    const uint32 g0 = src_argb[1] >> 4;
+    const uint32 r0 = src_argb[2] >> 4;
+    const uint32 a0 = src_argb[3] >> 4;
+    const uint32 b1 = src_argb[4] >> 4;
+    const uint32 g1 = src_argb[5] >> 4;
+    const uint32 r1 = src_argb[6] >> 4;
+    const uint32 a1 = src_argb[7] >> 4;
+    // WRITEWORD keeps the byte order consistent with ARGBToRGB565Row_C.
+    WRITEWORD(dst_rgb, b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+              (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    const uint32 b0 = src_argb[0] >> 4;
+    const uint32 g0 = src_argb[1] >> 4;
+    const uint32 r0 = src_argb[2] >> 4;
+    const uint32 a0 = src_argb[3] >> 4;
+    const uint32 pixel = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+    // Store low byte first, matching the pair path.
+    dst_rgb[0] = (uint8)(pixel & 255);
+    dst_rgb[1] = (uint8)(pixel >> 8);
+  }
+}
+
+// Luma from 8-bit R, G, B: weighted sum in 8.8 fixed point plus the 0x1080
+// bias, shifted down to 8 bits.
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 66 * r + 129 * g + 25 * b;
+  return (weighted + 0x1080) >> 8;
+}
+
+// U chroma from 8-bit R, G, B, with the 0x8080 bias added before the shift.
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 112 * b - 74 * g - 38 * r;
+  return (weighted + 0x8080) >> 8;
+}
+// V chroma from 8-bit R, G, B, with the 0x8080 bias added before the shift.
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 112 * r - 94 * g - 18 * b;
+  return (weighted + 0x8080) >> 8;
+}
+
+// MAKEROWY generates two row functions for a packed RGB format:
+//   NAME##ToYRow_C  - writes one Y value per pixel using RGBToY.
+//   NAME##ToUVRow_C - writes one U and one V value per pair of pixels, using
+//                     the average of a 2x2 block taken from the row at
+//                     src_rgb0 and the row at src_rgb0 + src_stride_rgb.
+//                     For odd widths the last U/V comes from the average of
+//                     the two vertically adjacent pixels.
+// R, G and B are the byte offsets of the channels inside a pixel and BPP is
+// the number of bytes per pixel.
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
+ src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
+ src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
+ src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+// Arguments: format name, then byte offsets of R, G, B, then bytes per pixel.
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+// JPeg luma from 8-bit R, G, B using the 7-bit coefficients, rounded by
+// adding 64 before the shift.
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 38 * r + 75 * g + 15 * b;
+  return (weighted + 64) >> 7;
+}
+
+// JPeg U chroma from 8-bit R, G, B, with the 0x8080 bias added before the
+// shift.
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 127 * b - 84 * g - 43 * r;
+  return (weighted + 0x8080) >> 8;
+}
+// JPeg V chroma from 8-bit R, G, B, with the 0x8080 bias added before the
+// shift.
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  const int weighted = 127 * r - 107 * g - 20 * b;
+  return (weighted + 0x8080) >> 8;
+}
+
+// Average of two values, rounded up.
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+// MAKEROWYJ generates the JPeg-range counterparts of the Y and UV row
+// functions:
+//   NAME##ToYJRow_C  - writes one Y value per pixel using RGBToYJ.
+//   NAME##ToUVJRow_C - writes one U and one V value per pair of pixels. The
+//                      2x2 block is averaged with nested AVGB calls
+//                      (vertical pairs first, then the two results), so each
+//                      step rounds up. For odd widths the last U/V comes
+//                      from the vertical average of the last column.
+// R, G and B are the byte offsets of the channels inside a pixel and BPP is
+// the number of bytes per pixel.
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+}
+
+// Only the ARGB layout (R at byte 2, G at byte 1, B at byte 0, 4 bytes per
+// pixel) is instantiated.
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+// Computes one Y value per RGB565 pixel. Each channel is first widened to
+// 8 bits by bit replication and then passed to RGBToY.
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Rebuild the 16-bit pixel from its low and high bytes.
+    const int pixel = src_rgb565[2 * i] | (src_rgb565[2 * i + 1] << 8);
+    const int b5 = pixel & 0x1f;
+    const int g6 = (pixel >> 5) & 0x3f;
+    const int r5 = pixel >> 11;
+    const uint8 b8 = (uint8)((b5 << 3) | (b5 >> 2));
+    const uint8 g8 = (uint8)((g6 << 2) | (g6 >> 4));
+    const uint8 r8 = (uint8)((r5 << 3) | (r5 >> 2));
+    dst_y[i] = RGBToY(r8, g8, b8);
+  }
+}
+
+// Computes one Y value per ARGB1555 pixel. The 5-bit color channels are
+// widened to 8 bits by bit replication and passed to RGBToY; the alpha bit
+// is ignored.
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Rebuild the 16-bit pixel from its low and high bytes.
+    const int pixel = src_argb1555[2 * i] | (src_argb1555[2 * i + 1] << 8);
+    const int b5 = pixel & 0x1f;
+    const int g5 = (pixel >> 5) & 0x1f;
+    const int r5 = (pixel >> 10) & 0x1f;
+    const uint8 b8 = (uint8)((b5 << 3) | (b5 >> 2));
+    const uint8 g8 = (uint8)((g5 << 3) | (g5 >> 2));
+    const uint8 r8 = (uint8)((r5 << 3) | (r5 >> 2));
+    dst_y[i] = RGBToY(r8, g8, b8);
+  }
+}
+
+// Computes one Y value per ARGB4444 pixel. Each 4-bit color nibble n is
+// widened to the byte (n << 4) | n, i.e. n * 0x11, and passed to RGBToY; the
+// alpha nibble is ignored.
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int lo = src_argb4444[2 * i];
+    const int hi = src_argb4444[2 * i + 1];
+    const uint8 b8 = (uint8)((lo & 0x0f) * 0x11);
+    const uint8 g8 = (uint8)((lo >> 4) * 0x11);
+    const uint8 r8 = (uint8)((hi & 0x0f) * 0x11);
+    dst_y[i] = RGBToY(r8, g8, b8);
+  }
+}
+
+// Computes one U and one V value per pair of RGB565 pixels, combining a 2x2
+// block taken from the row at src_rgb565 and the row at
+// src_rgb565 + src_stride_rgb565. Channel sums are scaled to 8 bits by bit
+// replication before RGBToU/RGBToV. For odd widths the last U/V comes from
+// the two vertically adjacent pixels of the last column.
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ // Pixels 0 and 1 are from the first row, 2 and 3 from the next row.
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b1 = src_rgb565[2] & 0x1f;
+ uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8 r1 = src_rgb565[3] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b3 = next_rgb565[2] & 0x1f;
+ uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8 r3 = next_rgb565[3] >> 3;
+ // Summing four 5/6/5-bit samples gives 7/8/7-bit totals.
+ uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ // Green is already 8 bits wide; only blue and red need widening.
+ b = (b << 1) | (b >> 6); // 787 -> 888.
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_rgb565 += 4;
+ next_rgb565 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ // Odd width: only one column is left, so two samples are summed.
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b = (b0 + b2); // 565 * 2 = 676.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 676 -> 888
+ g = (g << 1) | (g >> 6);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+// Computes one U and one V value per pair of ARGB1555 pixels, combining a
+// 2x2 block taken from the row at src_argb1555 and the row at
+// src_argb1555 + src_stride_argb1555. Channel sums are scaled to 8 bits by
+// bit replication before RGBToU/RGBToV; the alpha bit is ignored. For odd
+// widths the last U/V comes from the two vertically adjacent pixels of the
+// last column.
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    // Pixels 0 and 1 are from the first row, 2 and 3 from the next row.
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b1 = src_argb1555[2] & 0x1f;
+    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b3 = next_argb1555[2] & 0x1f;
+    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 777 -> 888.
+    g = (g << 1) | (g >> 6);
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb1555 += 4;
+    next_argb1555 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    // Odd width: only one column is left, so two samples are summed.
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    // Red is bits 2..6 of the high byte. The previous ">> 3" dropped the low
+    // red bit and pulled the alpha bit (bit 7) into the value.
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b = (b0 + b2);  // 555 * 2 = 666.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+// Computes one U and one V value per pair of ARGB4444 pixels, combining a
+// 2x2 block taken from the row at src_argb4444 and the row at
+// src_argb4444 + src_stride_argb4444. Channel sums are scaled to 8 bits by
+// bit replication before RGBToU/RGBToV; the alpha nibble is ignored. For odd
+// widths the last U/V comes from the two vertically adjacent pixels of the
+// last column.
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ // Pixels 0 and 1 are from the first row, 2 and 3 from the next row.
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b1 = src_argb4444[2] & 0x0f;
+ uint8 g1 = src_argb4444[2] >> 4;
+ uint8 r1 = src_argb4444[3] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b3 = next_argb4444[2] & 0x0f;
+ uint8 g3 = next_argb4444[2] >> 4;
+ uint8 r3 = next_argb4444[3] & 0x0f;
+ // Summing four 4-bit samples gives 6-bit totals.
+ uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb4444 += 4;
+ next_argb4444 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ // Odd width: only one column is left, so two samples are summed.
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b = (b0 + b2); // 444 * 2 = 555.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 3) | (b >> 2); // 555 -> 888.
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+// Computes one U and one V value for every ARGB pixel (no subsampling).
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8* pixel = src_argb + 4 * i;
+    const uint8 blue = pixel[0];
+    const uint8 green = pixel[1];
+    const uint8 red = pixel[2];
+    dst_u[i] = RGBToU(red, green, blue);
+    dst_v[i] = RGBToV(red, green, blue);
+  }
+}
+
+// Computes one U and one V value per horizontal pair of ARGB pixels, using
+// the truncated average of the two pixels. For odd widths the last pixel is
+// used on its own.
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    const uint8 blue = (src_argb[0] + src_argb[4]) >> 1;
+    const uint8 green = (src_argb[1] + src_argb[5]) >> 1;
+    const uint8 red = (src_argb[2] + src_argb[6]) >> 1;
+    *dst_u++ = RGBToU(red, green, blue);
+    *dst_v++ = RGBToV(red, green, blue);
+    src_argb += 8;
+  }
+  if (width & 1) {
+    const uint8 blue = src_argb[0];
+    const uint8 green = src_argb[1];
+    const uint8 red = src_argb[2];
+    dst_u[0] = RGBToU(red, green, blue);
+    dst_v[0] = RGBToV(red, green, blue);
+  }
+}
+
+// Computes one U and one V value per group of four horizontal ARGB pixels,
+// using the truncated average of the group. A trailing group of 3, 2 or 1
+// pixels is averaged over the pixels that exist.
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Number of complete groups of four; zero when width is below 4.
+  int quads = (width > 3) ? (width / 4) : 0;
+  uint8 blue;
+  uint8 green;
+  uint8 red;
+  for (; quads > 0; --quads) {
+    blue = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    green = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    red = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    *dst_u++ = RGBToU(red, green, blue);
+    *dst_v++ = RGBToV(red, green, blue);
+    src_argb += 16;
+  }
+  switch (width & 3) {
+    case 3:
+      blue = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+      green = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+      red = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+      break;
+    case 2:
+      blue = (src_argb[0] + src_argb[4]) >> 1;
+      green = (src_argb[1] + src_argb[5]) >> 1;
+      red = (src_argb[2] + src_argb[6]) >> 1;
+      break;
+    case 1:
+      blue = src_argb[0];
+      green = src_argb[1];
+      red = src_argb[2];
+      break;
+    default:
+      // No leftover pixels.
+      return;
+  }
+  dst_u[0] = RGBToU(red, green, blue);
+  dst_v[0] = RGBToV(red, green, blue);
+}
+
+// Converts a row of ARGB pixels to gray: B, G and R are all set to the
+// RGBToYJ luma of the source pixel and alpha is copied through.
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    // Compute the luma before any store.
+    const uint8 gray = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    dst_argb[0] = gray;
+    dst_argb[1] = gray;
+    dst_argb[2] = gray;
+    dst_argb[3] = src_argb[3];
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Convert a row of image to Sepia tone, in place. Alpha is left untouched.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint8* pixel = dst_argb + 4 * i;
+    const int blue = pixel[0];
+    const int green = pixel[1];
+    const int red = pixel[2];
+    // Weighted sums in 7-bit fixed point.
+    const int sepia_b = (blue * 17 + green * 68 + red * 35) >> 7;
+    const int sepia_g = (blue * 22 + green * 88 + red * 45) >> 7;
+    const int sepia_r = (blue * 24 + green * 98 + red * 50) >> 7;
+    // The blue weights sum to less than 128, so blue needs no clamp.
+    pixel[0] = sepia_b;
+    pixel[1] = clamp255(sepia_g);
+    pixel[2] = clamp255(sepia_r);
+  }
+}
+
+// Apply color matrix to a row of image. Matrix is signed.
+// matrix_argb holds 16 coefficients in 6-bit fixed point: four consecutive
+// entries (weights for B, G, R, A) per output channel, in B, G, R, A order.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int blue = src_argb[0];
+    const int green = src_argb[1];
+    const int red = src_argb[2];
+    const int alpha = src_argb[3];
+    int sums[4];
+    int c;
+    // Compute all four outputs before storing any of them.
+    for (c = 0; c < 4; ++c) {
+      const int8* weights = matrix_argb + 4 * c;
+      sums[c] = (blue * weights[0] + green * weights[1] +
+                 red * weights[2] + alpha * weights[3]) >> 6;
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = Clamp(sums[c]);
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Apply color table to a row of image, in place. table_argb is indexed as
+// value * 4 + channel, so every channel (B, G, R, A) has its own
+// interleaved lookup column.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint8* pixel = dst_argb + 4 * i;
+    int values[4];
+    int c;
+    // Read the whole pixel first, then replace each channel.
+    for (c = 0; c < 4; ++c) {
+      values[c] = pixel[c];
+    }
+    for (c = 0; c < 4; ++c) {
+      pixel[c] = table_argb[values[c] * 4 + c];
+    }
+  }
+}
+
+// Apply color table to the B, G and R channels of a row of image, in place.
+// table_argb is indexed as value * 4 + channel; alpha is left untouched.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint8* pixel = dst_argb + 4 * i;
+    int values[3];
+    int c;
+    // Read the color channels first, then replace each one.
+    for (c = 0; c < 3; ++c) {
+      values[c] = pixel[c];
+    }
+    for (c = 0; c < 3; ++c) {
+      pixel[c] = table_argb[values[c] * 4 + c];
+    }
+  }
+}
+
+// Quantizes the B, G and R channels of a row in place: each value is
+// multiplied by scale (16-bit fixed point), then mapped to
+// interval_size * step + interval_offset. Alpha is left untouched.
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint8* pixel = dst_argb + 4 * i;
+    int c;
+    for (c = 0; c < 3; ++c) {
+      const int value = pixel[c];
+      const int step = (value * scale) >> 16;
+      pixel[c] = step * interval_size + interval_offset;
+    }
+  }
+}
+
+// Replicates an 8-bit value into both bytes of a 16-bit value. Fully
+// parenthesized so it can be used inside larger expressions.
+#define REPEAT8(v) ((v) | ((v) << 8))
+// Multiplies two 16-bit replicated values and keeps the top 8 bits.
+#define SHADE(f, v) (((v) * (f)) >> 24)
+
+// Scales each channel of a row of ARGB pixels by the matching byte of value
+// (byte 0 scales B, byte 1 G, byte 2 R, byte 3 A).
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  const uint32 b_scale = REPEAT8(value & 0xff);
+  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32 a_scale = REPEAT8(value >> 24);
+
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb[0]);
+    const uint32 g = REPEAT8(src_argb[1]);
+    const uint32 r = REPEAT8(src_argb[2]);
+    const uint32 a = REPEAT8(src_argb[3]);
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Replicates an 8-bit value into both bytes of a 16-bit value. Fully
+// parenthesized so it can be used inside larger expressions.
+#define REPEAT8(v) ((v) | ((v) << 8))
+// Multiplies a 16-bit replicated value by an 8-bit scale and keeps the
+// bits above bit 15.
+#define SHADE(f, v) (((v) * (f)) >> 16)
+
+// Multiplies two rows of ARGB pixels channel by channel.
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb0[0]);
+    const uint32 g = REPEAT8(src_argb0[1]);
+    const uint32 r = REPEAT8(src_argb0[2]);
+    const uint32 a = REPEAT8(src_argb0[3]);
+    const uint32 b_scale = src_argb1[0];
+    const uint32 g_scale = src_argb1[1];
+    const uint32 r_scale = src_argb1[2];
+    const uint32 a_scale = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+// Adds two rows of ARGB pixels channel by channel, saturating at 255.
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int sums[4];
+    int c;
+    // Read both source pixels completely before storing the result.
+    for (c = 0; c < 4; ++c) {
+      sums[c] = src_argb1[c] + src_argb0[c];
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = clamp255(sums[c]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Subtracts src_argb1 from src_argb0 channel by channel, saturating at 0.
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; --remaining) {
+    int diffs[4];
+    int c;
+    // Read both source pixels completely before storing the result.
+    for (c = 0; c < 4; ++c) {
+      diffs[c] = src_argb0[c] - src_argb1[c];
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = clamp0(diffs[c]);
+    }
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+
+// Sobel functions which mimics SSSE3.
+// Horizontal Sobel over three rows: for each column i the differences
+// between columns i and i + 2 are combined with weights 1, 2, 1 and the
+// absolute value is saturated to 255.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int top = src_y0[i] - src_y0[i + 2];
+    const int middle = src_y1[i] - src_y1[i + 2];
+    const int bottom = src_y2[i] - src_y2[i + 2];
+    const int magnitude = Abs(top + middle * 2 + bottom);
+    dst_sobelx[i] = (uint8)(clamp255(magnitude));
+  }
+}
+
+// Vertical Sobel over two rows: for each column i the differences between
+// src_y0 and src_y1 at columns i, i + 1 and i + 2 are combined with weights
+// 1, 2, 1 and the absolute value is saturated to 255.
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int left = src_y0[i + 0] - src_y1[i + 0];
+    const int center = src_y0[i + 1] - src_y1[i + 1];
+    const int right = src_y0[i + 2] - src_y1[i + 2];
+    const int magnitude = Abs(left + center * 2 + right);
+    dst_sobely[i] = (uint8)(clamp255(magnitude));
+  }
+}
+
+// Combines Sobel X and Sobel Y rows into gray ARGB pixels: B, G and R are
+// all set to the saturated sum and alpha is 255.
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int sum = clamp255(src_sobelx[i] + src_sobely[i]);
+    uint8* pixel = dst_argb + 4 * i;
+    pixel[0] = (uint8)(sum);
+    pixel[1] = (uint8)(sum);
+    pixel[2] = (uint8)(sum);
+    pixel[3] = (uint8)(255u);
+  }
+}
+
+// Combines Sobel X and Sobel Y rows into a single plane holding the
+// saturated sum of the two.
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int horizontal = src_sobelx[i];
+    const int vertical = src_sobely[i];
+    dst_y[i] = (uint8)(clamp255(horizontal + vertical));
+  }
+}
+
+// Combines Sobel X and Sobel Y rows into ARGB pixels: B holds Sobel Y,
+// R holds Sobel X, G holds their saturated sum and alpha is 255.
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int horizontal = src_sobelx[i];
+    const int vertical = src_sobely[i];
+    const int both = clamp255(horizontal + vertical);
+    uint8* pixel = dst_argb + 4 * i;
+    pixel[0] = (uint8)(vertical);
+    pixel[1] = (uint8)(both);
+    pixel[2] = (uint8)(horizontal);
+    pixel[3] = (uint8)(255u);
+  }
+}
+
+// Copies each Y value into the B, G and R channels of an ARGB pixel and
+// sets alpha to 255.
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint8 luma = src_y[i];
+    uint8* pixel = dst_argb + 4 * i;
+    pixel[0] = luma;
+    pixel[1] = luma;
+    pixel[2] = luma;
+    pixel[3] = 255u;
+  }
+}
+
+// C reference code that mimics the YUV assembly.
+// All coefficients below are in 6-bit fixed point (scaled by 64).
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129 does not fit in int8, so capped at 127 */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+// Each bias is the U and V contribution at u = v = 128. These expand to
+// unparenthesized sums, so uses must wrap them in parentheses.
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+// Converts one Y, U, V sample to 8-bit B, G, R written through b, g and r.
+// Y is offset by 16 and scaled by YG; the chroma terms have their bias
+// subtracted; the sum is shifted down by 6 and clamped to 0..255.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ int32 y1 = ((int32)(y) - 16) * YG;
+ *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
+ *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
+ *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
+}
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+// C mimic assembly.
+// Converts a row of I444 (one U and V per pixel) to ARGB with alpha 255.
+// This variant averages U and V over each horizontal pixel pair (rounding
+// up); an odd trailing pixel uses its own U and V.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    // The trailing pixel needs its alpha written too; it was previously
+    // left untouched.
+    rgb_buf[3] = 255;
+  }
+}
+#else
+// Converts a row of I444 (one U and V per pixel) to ARGB with alpha 255.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+#endif
+// Also used for 420
+// Converts a row of I422 to ARGB with alpha 255. Each U/V sample is shared
+// by two horizontally adjacent Y samples.
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    uint8* first = rgb_buf;
+    uint8* second = rgb_buf + 4;
+    YuvPixel(src_y[0], src_u[0], src_v[0], first + 0, first + 1, first + 2);
+    first[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0], second + 0, second + 1, second + 2);
+    second[3] = 255;
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 8;  // Two ARGB pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts a row of I422 to 3-byte RGB24 pixels (bytes B, G, R). Each U/V
+// sample is shared by two horizontally adjacent Y samples.
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* rgb_buf,
+                      int width) {
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    uint8* first = rgb_buf;
+    uint8* second = rgb_buf + 3;
+    YuvPixel(src_y[0], src_u[0], src_v[0], first + 0, first + 1, first + 2);
+    YuvPixel(src_y[1], src_u[0], src_v[0], second + 0, second + 1, second + 2);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 6;  // Two RGB24 pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
+// Converts a row of I422 to 3-byte RAW pixels (bytes R, G, B). Each U/V
+// sample is shared by two horizontally adjacent Y samples.
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* rgb_buf,
+                    int width) {
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    uint8* first = rgb_buf;
+    uint8* second = rgb_buf + 3;
+    // YuvPixel outputs are passed in b, g, r order, so blue goes to byte 2.
+    YuvPixel(src_y[0], src_u[0], src_v[0], first + 2, first + 1, first + 0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], second + 2, second + 1, second + 0);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    rgb_buf += 6;  // Two RAW pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+  }
+}
+
+// Converts a row of I422 to 16-bit ARGB4444 pixels with the alpha nibble
+// set to 0xf. Each U/V sample is shared by two horizontally adjacent Y
+// samples; pixel pairs are stored as one 32-bit word.
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         int width) {
+  uint8 blue;
+  uint8 green;
+  uint8 red;
+  uint32 first;
+  uint32 second;
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+    YuvPixel(src_y[1], src_u[0], src_v[0], &blue, &green, &red);
+    second = (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+    *(uint32*)(dst_argb4444) = first | (second << 16);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    dst_argb4444 += 4;  // Two 16-bit pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 4) | ((green >> 4) << 4) | ((red >> 4) << 8) | 0xf000;
+    *(uint16*)(dst_argb4444) = first;
+  }
+}
+
+// Converts a row of I422 to 16-bit ARGB1555 pixels with the alpha bit set.
+// Each U/V sample is shared by two horizontally adjacent Y samples; pixel
+// pairs are stored as one 32-bit word.
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         int width) {
+  uint8 blue;
+  uint8 green;
+  uint8 red;
+  uint32 first;
+  uint32 second;
+  // Number of complete pixel pairs; zero when width is below 2.
+  int pairs = (width > 1) ? (width / 2) : 0;
+  for (; pairs > 0; --pairs) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+    YuvPixel(src_y[1], src_u[0], src_v[0], &blue, &green, &red);
+    second = (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+    *(uint32*)(dst_argb1555) = first | (second << 16);
+    src_y += 2;
+    ++src_u;
+    ++src_v;
+    dst_argb1555 += 4;  // Two 16-bit pixels.
+  }
+  if (width & 1) {
+    // Odd trailing pixel.
+    YuvPixel(src_y[0], src_u[0], src_v[0], &blue, &green, &red);
+    first = (blue >> 3) | ((green >> 3) << 5) | ((red >> 3) << 10) | 0x8000;
+    *(uint16*)(dst_argb1555) = first;
+  }
+}
+
+// Converts one row of I422 (one U and one V sample shared by two Y samples)
+// to RGB565. Each pixel is a 16-bit value with blue in bits 0-4, green in
+// bits 5-10 and red in bits 11-15, stored in native byte order.
+// Writes 2 * width bytes to dst_rgb565.
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 two_pixels;
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic: a 5-bit red value shifted left by 27 does
+    // not fit in a signed int, so shifting the promoted uint8 would overflow.
+    two_pixels = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+                 ((uint32)(r0 >> 3) << 11) | ((uint32)(b1 >> 3) << 16) |
+                 ((uint32)(g1 >> 2) << 21) | ((uint32)(r1 >> 3) << 27);
+    // dst_rgb565 is a byte pointer with no alignment guarantee, so copy the
+    // packed word rather than storing through a casted uint32 pointer.
+    memcpy(dst_rgb565, &two_pixels, sizeof(two_pixels));
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: one trailing pixel that reuses the current chroma sample.
+    uint16 one_pixel;
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    one_pixel = (uint16)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
+    memcpy(dst_rgb565, &one_pixel, sizeof(one_pixel));
+  }
+}
+
+// Converts one row of I411 to ARGB: each U/V sample is applied to four
+// consecutive Y samples. Every output pixel is 4 bytes with byte 3 set to 255.
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  // Whole groups of four pixels.
+  for (remaining = width; remaining > 3; remaining -= 4) {
+    for (k = 0; k < 4; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_y += 4;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 16;  // Advance 4 pixels.
+  }
+  // Leftover pair; the chroma pointers stay put because the same sample
+  // also covers any final single pixel.
+  if (width & 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  // Leftover single pixel.
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of NV12 (Y plane plus interleaved chroma, U byte first)
+// to ARGB. Each chroma pair colors two Y samples; byte 3 of every output
+// pixel is set to 255.
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* usrc_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], usrc_v[0], usrc_v[1], px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_y += 2;
+    usrc_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: final pixel uses the next chroma pair.
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of NV21 (Y plane plus interleaved chroma, V byte first)
+// to ARGB. Each chroma pair colors two Y samples; byte 3 of every output
+// pixel is set to 255.
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      // U is the second byte of the pair, V the first.
+      YuvPixel(src_y[k], src_vu[1], src_vu[0], px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: final pixel uses the next chroma pair.
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of NV12 (Y plane plus interleaved chroma, U byte first)
+// to RGB565. Each pixel is a 16-bit value with blue in bits 0-4, green in
+// bits 5-10 and red in bits 11-15, stored in native byte order.
+// Writes 2 * width bytes to dst_rgb565.
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* usrc_v,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 two_pixels;
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic: a 5-bit red value shifted left by 27 does
+    // not fit in a signed int, so shifting the promoted uint8 would overflow.
+    two_pixels = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+                 ((uint32)(r0 >> 3) << 11) | ((uint32)(b1 >> 3) << 16) |
+                 ((uint32)(g1 >> 2) << 21) | ((uint32)(r1 >> 3) << 27);
+    // dst_rgb565 is a byte pointer with no alignment guarantee, so copy the
+    // packed word rather than storing through a casted uint32 pointer.
+    memcpy(dst_rgb565, &two_pixels, sizeof(two_pixels));
+    src_y += 2;
+    usrc_v += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: one trailing pixel using the next chroma pair.
+    uint16 one_pixel;
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    one_pixel = (uint16)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
+    memcpy(dst_rgb565, &one_pixel, sizeof(one_pixel));
+  }
+}
+
+// Converts one row of NV21 (Y plane plus interleaved chroma, V byte first)
+// to RGB565. Each pixel is a 16-bit value with blue in bits 0-4, green in
+// bits 5-10 and red in bits 11-15, stored in native byte order.
+// Writes 2 * width bytes to dst_rgb565.
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* vsrc_u,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 two_pixels;
+    // U is the second byte of the chroma pair, V the first.
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+    // Pack in unsigned arithmetic: a 5-bit red value shifted left by 27 does
+    // not fit in a signed int, so shifting the promoted uint8 would overflow.
+    two_pixels = (uint32)(b0 >> 3) | ((uint32)(g0 >> 2) << 5) |
+                 ((uint32)(r0 >> 3) << 11) | ((uint32)(b1 >> 3) << 16) |
+                 ((uint32)(g1 >> 2) << 21) | ((uint32)(r1 >> 3) << 27);
+    // dst_rgb565 is a byte pointer with no alignment guarantee, so copy the
+    // packed word rather than storing through a casted uint32 pointer.
+    memcpy(dst_rgb565, &two_pixels, sizeof(two_pixels));
+    src_y += 2;
+    vsrc_u += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: one trailing pixel using the next chroma pair.
+    uint16 one_pixel;
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    one_pixel = (uint16)((b0 >> 3) | ((g0 >> 2) << 5) | ((r0 >> 3) << 11));
+    memcpy(dst_rgb565, &one_pixel, sizeof(one_pixel));
+  }
+}
+
+// Converts one row of packed YUY2 (bytes Y0 U Y1 V per pixel pair) to ARGB.
+// Byte 3 of every output pixel is set to 255.
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      // Luma sits at even offsets; both pixels share the U and V bytes.
+      YuvPixel(src_yuy2[2 * k], src_yuy2[1], src_yuy2[3],
+               px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_yuy2 += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel still reads the U and V bytes of its group.
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of packed UYVY (bytes U Y0 V Y1 per pixel pair) to ARGB.
+// Byte 3 of every output pixel is set to 255.
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      // Luma sits at odd offsets; both pixels share the U and V bytes.
+      YuvPixel(src_uyvy[2 * k + 1], src_uyvy[0], src_uyvy[2],
+               px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_uyvy += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: the last pixel still reads the U and V bytes of its group.
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of I422 to 4-byte pixels laid out as 255, R, G, B:
+// YuvPixel's three outputs go to bytes 3, 2 and 1, and byte 0 is set to 255.
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px + 3, px + 2, px + 1);
+      px[0] = 255;
+    }
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: final pixel reuses the current chroma sample.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+    rgb_buf[0] = 255;
+  }
+}
+
+// Converts one row of I422 to 4-byte pixels laid out as R, G, B, 255:
+// YuvPixel's three outputs go to bytes 2, 1 and 0, and byte 3 is set to 255.
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px + 2, px + 1, px + 0);
+      px[3] = 255;
+    }
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: final pixel reuses the current chroma sample.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Converts one row of I422 to 4-byte pixels laid out as 255, B, G, R:
+// YuvPixel's three outputs go to bytes 1, 2 and 3, and byte 0 is set to 255.
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], src_u[0], src_v[0], px + 1, px + 2, px + 3);
+      px[0] = 255;
+    }
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: final pixel reuses the current chroma sample.
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+    rgb_buf[0] = 255;
+  }
+}
+
+// Converts one row of luma to ARGB by running YuvPixel with both chroma
+// inputs fixed at 128. Byte 3 of every output pixel is set to 255.
+void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+  int remaining;
+  int k;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    for (k = 0; k < 2; ++k) {
+      uint8* px = rgb_buf + 4 * k;
+      YuvPixel(src_y[k], 128, 128, px + 0, px + 1, px + 2);
+      px[3] = 255;
+    }
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    // Odd width: one trailing pixel.
+    YuvPixel(src_y[0], 128, 128,
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+// Writes the width bytes of src to dst in reverse order.
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+  const uint8* from = src + width - 1;  // Last source byte.
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = *from;
+    --from;
+  }
+}
+
+// Reverses a row of width interleaved UV pairs while splitting it: dst_u
+// receives the U bytes and dst_v the V bytes, last source pair first.
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* pair = src_uv + ((width - 1) << 1);  // Last source pair.
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_u[0] = pair[0];
+    dst_u[1] = pair[-2];
+    dst_v[0] = pair[1];
+    dst_v[1] = pair[-1];
+    dst_u += 2;
+    dst_v += 2;
+    pair -= 4;
+  }
+  if (width & 1) {
+    // Odd width: the first source pair becomes the last output sample.
+    dst_u[0] = pair[0];
+    dst_v[0] = pair[1];
+  }
+}
+
+// Reverses a row of width 4-byte pixels; each pixel is moved as one 32-bit
+// word, so the byte order inside a pixel is unchanged.
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+  const uint32* from = (const uint32*)(src) + width - 1;  // Last source pixel.
+  uint32* to = (uint32*)(dst);
+  int x;
+  for (x = 0; x < width; ++x) {
+    to[x] = *from;
+    --from;
+  }
+}
+
+// De-interleaves width UV pairs from src_uv into separate dst_u and dst_v
+// rows.
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_u[0] = src_uv[0];
+    dst_u[1] = src_uv[2];
+    dst_v[0] = src_uv[1];
+    dst_v[1] = src_uv[3];
+    dst_u += 2;
+    dst_v += 2;
+    src_uv += 4;
+  }
+  if (width & 1) {
+    // Odd width: one trailing pair.
+    dst_u[0] = src_uv[0];
+    dst_v[0] = src_uv[1];
+  }
+}
+
+// Interleaves width samples from src_u and src_v into dst_uv as U, V pairs.
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_uv[0] = src_u[0];
+    dst_uv[1] = src_v[0];
+    dst_uv[2] = src_u[1];
+    dst_uv[3] = src_v[1];
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    // Odd width: one trailing pair.
+    dst_uv[0] = src_u[0];
+    dst_uv[1] = src_v[0];
+  }
+}
+
+// Copies count bytes from src to dst with memcpy, so the two rows must not
+// overlap.
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+  memcpy(dst, src, (size_t)count);
+}
+
+// Copies count 16-bit samples (count * 2 bytes) from src to dst with memcpy,
+// so the two rows must not overlap.
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+  memcpy(dst, src, (size_t)(count * 2));
+}
+
+// Fills count bytes at dst with the low byte of v8.
+void SetRow_C(uint8* dst, uint32 v8, int count) {
+#if !defined(_MSC_VER)
+  memset(dst, v8, count);
+#else
+  // Explicit byte loop for Visual C: VC will generate rep stosb.
+  int x;
+  for (x = 0; x < count; ++x) {
+    dst[x] = v8;
+  }
+#endif
+}
+
+// Fills a width x height rectangle of 32-bit pixels with v32. Rows start
+// dst_stride bytes apart.
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  int rows_left;
+  for (rows_left = height; rows_left > 0; --rows_left) {
+    uint32* row = (uint32*)(dst);
+    int col = 0;
+    while (col < width) {
+      row[col] = v32;
+      ++col;
+    }
+    dst += dst_stride;
+  }
+}
+
+// Produces one row of U and V (420) from two rows of YUY2 (422) by taking
+// the rounded average of each chroma byte with the one src_stride_yuy2 bytes
+// below it. Emits one U and one V per two pixels of width.
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; remaining -= 2) {
+    const uint8* below = src_yuy2 + src_stride_yuy2;
+    *dst_u = (src_yuy2[1] + below[1] + 1) >> 1;
+    *dst_v = (src_yuy2[3] + below[3] + 1) >> 1;
+    ++dst_u;
+    ++dst_v;
+    src_yuy2 += 4;
+  }
+}
+
+// Extracts the U and V bytes of one YUY2 row (422) into separate U and V
+// rows (422): one U and one V per two pixels of width.
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; remaining -= 2) {
+    *dst_u = src_yuy2[1];
+    *dst_v = src_yuy2[3];
+    ++dst_u;
+    ++dst_v;
+    src_yuy2 += 4;
+  }
+}
+
+// Extracts the Y bytes of one YUY2 row into a planar Y row (420/422).
+// Luma occupies the even byte offsets of the packed row.
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_yuy2[2 * x];
+  }
+}
+
+// Produces one row of U and V (420) from two rows of UYVY (422) by taking
+// the rounded average of each chroma byte with the one src_stride_uyvy bytes
+// below it. Emits one U and one V per two pixels of width.
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; remaining -= 2) {
+    const uint8* below = src_uyvy + src_stride_uyvy;
+    *dst_u = (src_uyvy[0] + below[0] + 1) >> 1;
+    *dst_v = (src_uyvy[2] + below[2] + 1) >> 1;
+    ++dst_u;
+    ++dst_v;
+    src_uyvy += 4;
+  }
+}
+
+// Extracts the U and V bytes of one UYVY row (422) into separate U and V
+// rows (422): one U and one V per two pixels of width.
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int remaining;
+  for (remaining = width; remaining > 0; remaining -= 2) {
+    *dst_u = src_uyvy[0];
+    *dst_v = src_uyvy[2];
+    ++dst_u;
+    ++dst_v;
+    src_uyvy += 4;
+  }
+}
+
+// Extracts the Y bytes of one UYVY row into a planar Y row (420/422).
+// Luma occupies the odd byte offsets of the packed row.
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_uyvy[2 * x + 1];
+  }
+}
+
+// Blends one channel: the foreground value f is added unscaled to the
+// background b weighted by (256 - a) / 256. The sum is clamped with
+// clamp255 so it cannot wrap when stored to a byte; the old form relied on
+// uint8 truncation. Arguments and the whole body are parenthesized so the
+// macro is safe inside larger expressions.
+#define BLEND(f, b, a) clamp255((((256 - (a)) * (b)) >> 8) + (f))
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// The alpha of src_argb0 drives the blend; output alpha is always 255.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    // Load every input of this pixel before storing, so that dst_argb may
+    // alias either source row.
+    const uint32 fb = src_argb0[0];
+    const uint32 fg = src_argb0[1];
+    const uint32 fr = src_argb0[2];
+    const uint32 a = src_argb0[3];
+    const uint32 bb = src_argb1[0];
+    const uint32 bg = src_argb1[1];
+    const uint32 br = src_argb1[2];
+    dst_argb[0] = BLEND(fb, bb, a);
+    dst_argb[1] = BLEND(fg, bg, a);
+    dst_argb[2] = BLEND(fr, br, a);
+    dst_argb[3] = 255u;
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef BLEND
+// Scales channel f by alpha a. Each byte is widened to 16 bits by repeating
+// it (v | v << 8), the two are multiplied, and the top 8 bits of the 32-bit
+// product are kept, so 255 * 255 yields 255. Arguments and the whole body
+// are parenthesized so the macro is safe inside larger expressions.
+#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
+
+// Multiply source RGB by alpha and store to destination.
+// Alpha itself is copied through unchanged.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // Load the whole pixel first so the conversion also works in place.
+    const uint32 b = src_argb[0];
+    const uint32 g = src_argb[1];
+    const uint32 r = src_argb[2];
+    const uint32 a = src_argb[3];
+    dst_argb[0] = ATTENUATE(b, a);
+    dst_argb[1] = ATTENUATE(g, a);
+    dst_argb[2] = ATTENUATE(r, a);
+    dst_argb[3] = a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.8 fixed point inverse table with 1.0 in the upper short and 1 / a in the
+// lower short.
+// Three entries are written by hand instead of through T():
+//   [0]   lower short is 0, because 0x10000 / 0 is undefined;
+//   [1]   lower short is 0xffff, because 0x10000 does not fit in 16 bits;
+//   [255] lower short is 0x0100 (T would give 0x0101), so that
+//         (c * 0x0100) >> 8 returns c unchanged.
+// The macro body and argument are parenthesized so T() stays correct if it
+// is ever used inside a larger expression.
+#define T(a) (0x01000000 + (0x10000 / (a)))
+const uint32 fixed_invtbl8[256] = {
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+// Undoes alpha premultiplication: each color channel is multiplied by the
+// 8.8 fixed point reciprocal of alpha from fixed_invtbl8 and clamped to 255.
+// Alpha is copied through unchanged.
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i, src_argb += 4, dst_argb += 4) {
+    const uint32 alpha = src_argb[3];
+    // Only the lower short of the table entry holds the reciprocal.
+    const uint32 inv = fixed_invtbl8[alpha] & 0xffff;
+    // Compute all three channels before storing so in-place use is safe.
+    const uint32 blue = (src_argb[0] * inv) >> 8;
+    const uint32 green = (src_argb[1] * inv) >> 8;
+    const uint32 red = (src_argb[2] * inv) >> 8;
+    // Clamping should not be necessary but is free in assembly.
+    dst_argb[0] = clamp255(blue);
+    dst_argb[1] = clamp255(green);
+    dst_argb[2] = clamp255(red);
+    dst_argb[3] = alpha;
+  }
+}
+
+// Builds one row of a 4-channel summed-area table: for every pixel x and
+// channel c, cumsum = (sum of row[0..x] for c) + previous_cumsum at the same
+// position. row holds 4 bytes per pixel; both sum rows hold 4 int32 per
+// pixel.
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width) {
+  int32 running[4] = {0, 0, 0, 0};
+  int x;
+  int c;
+  for (x = 0; x < width; ++x) {
+    const int base = x * 4;
+    // Accumulate all four channels first, then emit them.
+    for (c = 0; c < 4; ++c) {
+      running[c] += row[base + c];
+    }
+    for (c = 0; c < 4; ++c) {
+      cumsum[base + c] = running[c] + previous_cumsum[base + c];
+    }
+  }
+}
+
+// Turns cumulative sums into box averages for count pixels. For each
+// channel the box sum is bl[w] + tl[0] - bl[0] - tl[w], taken at the
+// current pixel of the two cumulative-sum rows tl and bl, where w is the
+// offset in int32 elements between the two columns. The sum is scaled by
+// 1 / area and truncated to a byte.
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+                                 int w, int area, uint8* dst, int count) {
+  const float inv_area = 1.0f / area;
+  int i;
+  int c;
+  for (i = 0; i < count; ++i) {
+    for (c = 0; c < 4; ++c) {
+      dst[c] = (uint8)((bl[w + c] + tl[c] - bl[c] - tl[w + c]) * inv_area);
+    }
+    dst += 4;
+    tl += 4;
+    bl += 4;
+  }
+}
+
+// Fills one destination row by sampling the source along a straight line.
+// uv_dudv[0..1] is the starting (u, v) source coordinate and uv_dudv[2..3]
+// the step added after each pixel. Coordinates are truncated to integers
+// and the addressed 4-byte source pixel is copied without filtering.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width) {
+  float pos[2];
+  int i;
+  pos[0] = uv_dudv[0];
+  pos[1] = uv_dudv[1];
+  for (i = 0; i < width; ++i, dst_argb += 4) {
+    const int x = (int)(pos[0]);
+    const int y = (int)(pos[1]);
+    const uint8* sample = src_argb + y * src_argb_stride + x * 4;
+    *(uint32*)(dst_argb) = *(const uint32*)(sample);
+    pos[0] += uv_dudv[2];
+    pos[1] += uv_dudv[3];
+  }
+}
+
+// Averages two rows into one, rounding up, for conversions such as
+// I422ToI420. The second row starts src_uv_stride bytes after the first.
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix) {
+  const uint8* below = src_uv + src_uv_stride;
+  int x;
+  for (x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + below[x] + 1) >> 1;
+  }
+}
+
+// 16-bit variant of HalfRow_C: averages two rows of uint16 samples, rounding
+// up. The second row starts src_uv_stride elements after the first.
+void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                  uint16* dst_uv, int pix) {
+  const uint16* below = src_uv + src_uv_stride;
+  int x;
+  for (x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + below[x] + 1) >> 1;
+  }
+}
+
+// C version 2x2 -> 2x1.
+// Blends the row at src_ptr with the row src_stride bytes below it:
+//   dst = (row0 * (256 - f) + row1 * f) >> 8, with f = source_y_fraction.
+// f == 0 is a plain copy and f == 128 uses the rounding average HalfRow_C.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width, int source_y_fraction) {
+  const uint8* row0 = src_ptr;
+  const uint8* row1 = src_ptr + src_stride;
+  const int weight1 = source_y_fraction;
+  const int weight0 = 256 - weight1;
+  int x;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width; ++x) {
+    dst_ptr[x] = (row0[x] * weight0 + row1[x] * weight1) >> 8;
+  }
+}
+
+// 16-bit variant of InterpolateRow_C. Blends the row at src_ptr with the row
+// src_stride elements below it:
+//   dst = (row0 * (256 - f) + row1 * f) >> 8, with f = source_y_fraction.
+// f == 0 is a plain copy and f == 128 uses the rounding average HalfRow_16_C.
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width, int source_y_fraction) {
+  const uint16* row0 = src_ptr;
+  const uint16* row1 = src_ptr + src_stride;
+  const int weight1 = source_y_fraction;
+  const int weight0 = 256 - weight1;
+  int x;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width; ++x) {
+    dst_ptr[x] = (row0[x] * weight0 + row1[x] * weight1) >> 8;
+  }
+}
+
+// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
+// The low byte of selector is the source byte offset for even output
+// pixels and the next byte the offset for odd ones; both are applied from
+// the start of each 8-byte pixel pair.
+void ARGBToBayerRow_C(const uint8* src_argb,
+                      uint8* dst_bayer, uint32 selector, int pix) {
+  const int even_offset = selector & 0xff;
+  const int odd_offset = (selector >> 8) & 0xff;
+  int remaining;
+  for (remaining = pix; remaining > 1; remaining -= 2) {
+    dst_bayer[0] = src_argb[even_offset];
+    dst_bayer[1] = src_argb[odd_offset];
+    dst_bayer += 2;
+    src_argb += 8;
+  }
+  if (pix & 1) {
+    // Odd count: one trailing even-position sample.
+    dst_bayer[0] = src_argb[even_offset];
+  }
+}
+
+// Select G channel from ARGB. e.g. GGGGGGGG
+// Copies byte 1 of every 4-byte pixel; selector is not used.
+void ARGBToBayerGGRow_C(const uint8* src_argb,
+                        uint8* dst_bayer, uint32 selector, int pix) {
+  int x;
+  for (x = 0; x < pix; ++x) {
+    dst_bayer[x] = src_argb[4 * x + 1];
+  }
+}
+
+// Reorders the bytes of each 4-byte pixel: output byte k is taken from
+// source byte shuffler[k]. Only the first 4 shuffler values are used.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix) {
+  const int from0 = shuffler[0];
+  const int from1 = shuffler[1];
+  const int from2 = shuffler[2];
+  const int from3 = shuffler[3];
+  int x;
+  for (x = 0; x < pix; ++x, src_argb += 4, dst_argb += 4) {
+    // Gather all four bytes before storing, to support in-place conversion.
+    const uint8 out0 = src_argb[from0];
+    const uint8 out1 = src_argb[from1];
+    const uint8 out2 = src_argb[from2];
+    const uint8 out3 = src_argb[from3];
+    dst_argb[0] = out0;
+    dst_argb[1] = out1;
+    dst_argb[2] = out2;
+    dst_argb[3] = out3;
+  }
+}
+
+// Packs planar I422 into YUY2: every two pixels become the 4 bytes
+// Y0 U Y1 V. For an odd width the final group repeats its single Y sample.
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = *src_u++;
+    dst_frame[2] = src_y[1];
+    dst_frame[3] = *src_v++;
+    src_y += 2;
+    dst_frame += 4;
+  }
+  if (width & 1) {
+    const uint8 last_y = src_y[0];
+    dst_frame[0] = last_y;
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = last_y;  // duplicate last y
+    dst_frame[3] = src_v[0];
+  }
+}
+
+// Packs planar I422 into UYVY: every two pixels become the 4 bytes
+// U Y0 V Y1. For an odd width the final group repeats its single Y sample.
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int remaining;
+  for (remaining = width; remaining > 1; remaining -= 2) {
+    dst_frame[0] = *src_u++;
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = *src_v++;
+    dst_frame[3] = src_y[1];
+    src_y += 2;
+    dst_frame += 4;
+  }
+  if (width & 1) {
+    const uint8 last_y = src_y[0];
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = last_y;
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = last_y;  // duplicate last y
+  }
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
+// row_win.cc has asm version, but GCC uses 2 step wrapper.
+#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+// Converts a row of I422 to RGB565 in two steps: I422 to a temporary ARGB
+// row, then ARGB to RGB565.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm, including what happens when allocation fails.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ // Scratch row of ARGB: 4 bytes per pixel.
+ align_buffer_64(row, width * 4);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToRGB565Row_SSE2(row, rgb_buf, width);
+ free_aligned_buffer_64(row);
+}
+#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+// Converts a row of I422 to ARGB1555 in two steps: I422 to a temporary ARGB
+// row, then ARGB to ARGB1555.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ // Scratch row of ARGB: 4 bytes per pixel.
+ align_buffer_64(row, width * 4);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
+ free_aligned_buffer_64(row);
+}
+
+// Converts a row of I422 to ARGB4444 in two steps: I422 to a temporary ARGB
+// row, then ARGB to ARGB4444.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ // Scratch row of ARGB: 4 bytes per pixel.
+ align_buffer_64(row, width * 4);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
+ free_aligned_buffer_64(row);
+}
+
+// Converts a row of NV12 to RGB565 in two steps: NV12 to a temporary ARGB
+// row, then ARGB to RGB565.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ // Scratch row of ARGB: 4 bytes per pixel.
+ align_buffer_64(row, width * 4);
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+ free_aligned_buffer_64(row);
+}
+
+// Converts a row of NV21 to RGB565 in two steps: NV21 to a temporary ARGB
+// row, then ARGB to RGB565.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width) {
+ // Scratch row of ARGB: 4 bytes per pixel.
+ align_buffer_64(row, width * 4);
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+ free_aligned_buffer_64(row);
+}
+
+// Converts a row of YUY2 to ARGB by unpacking it into temporary planar Y, U
+// and V rows and then running the I422 to ARGB row function on them.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ // One scratch buffer holds all three rows. With W = width rounded up to a
+ // multiple of 64, Y takes the first W bytes, U starts at W and V at
+ // W + W / 2; the total size is 2 * W.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, width);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+ free_aligned_buffer_64(row_y);
+}
+
+// Same two-step YUY2 to ARGB conversion as YUY2ToARGBRow_SSSE3, but built
+// from the _Unaligned_ row functions.
+// NOTE(review): presumably for source/destination pointers that are not
+// 16-byte aligned - confirm against the callers. align_buffer_64 /
+// free_aligned_buffer_64 are defined outside this view.
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ // One scratch buffer holds all three rows. With W = width rounded up to a
+ // multiple of 64, Y takes the first W bytes, U starts at W and V at
+ // W + W / 2; the total size is 2 * W.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
+ YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
+ I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+ free_aligned_buffer_64(row_y);
+}
+
+// Converts a row of UYVY to ARGB by unpacking it into temporary planar Y, U
+// and V rows and then running the I422 to ARGB row function on them.
+// NOTE(review): align_buffer_64 / free_aligned_buffer_64 are defined outside
+// this view; presumably they allocate and release a 64-byte aligned scratch
+// buffer - confirm.
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ // One scratch buffer holds all three rows. With W = width rounded up to a
+ // multiple of 64, Y takes the first W bytes, U starts at W and V at
+ // W + W / 2; the total size is 2 * W.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
+ UYVYToYRow_SSE2(src_uyvy, row_y, width);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+ free_aligned_buffer_64(row_y);
+}
+
+// Same two-step UYVY to ARGB conversion as UYVYToARGBRow_SSSE3, but built
+// from the _Unaligned_ row functions.
+// NOTE(review): presumably for source/destination pointers that are not
+// 16-byte aligned - confirm against the callers. align_buffer_64 /
+// free_aligned_buffer_64 are defined outside this view.
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ // One scratch buffer holds all three rows. With W = width rounded up to a
+ // multiple of 64, Y takes the first W bytes, U starts at W and V at
+ // W + W / 2; the total size is 2 * W.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
+ UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
+ I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+ free_aligned_buffer_64(row_y);
+}
+
+#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#endif // !defined(LIBYUV_DISABLE_X86)
+
+// Applies a cubic polynomial to every channel of every pixel. For channel c
+// (0..3) with input value v the result is
+//   poly[c] + poly[c + 4] * v + poly[c + 8] * v^2 + poly[c + 12] * v^3,
+// converted to int32 and passed through Clamp before being stored.
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width) {
+  int i;
+  int c;
+  for (i = 0; i < width; ++i) {
+    float result[4];
+    // Evaluate all four channels before storing any of them.
+    for (c = 0; c < 4; ++c) {
+      const float v = (float)(src_argb[c]);
+      const float v2 = v * v;
+      const float v3 = v2 * v;
+      float sum = poly[c] + poly[c + 4] * v;
+      sum += poly[c + 8] * v2;
+      sum += poly[c + 12] * v3;
+      result[c] = sum;
+    }
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = Clamp((int32)(result[c]));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Remaps the three color channels of each pixel through a lookup table
+// chosen by that pixel's weighted luminance. The three low bytes of
+// lumacoeff are the weights for bytes 0, 1 and 2 of the pixel. The weighted
+// sum masked with 0x7F00 is a byte offset into luma, i.e. it selects one of
+// up to 128 consecutive 256-entry tables. Byte 3 (alpha) is copied as is.
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  const uint32 weight_b = lumacoeff & 0xff;
+  const uint32 weight_g = (lumacoeff >> 8) & 0xff;
+  const uint32 weight_r = (lumacoeff >> 16) & 0xff;
+  int i;
+  for (i = 0; i < width; ++i, src_argb += 4, dst_argb += 4) {
+    // Luminance in rows, color values in columns.
+    const uint8* table = ((src_argb[0] * weight_b + src_argb[1] * weight_g +
+                           src_argb[2] * weight_r) & 0x7F00u) + luma;
+    dst_argb[0] = table[src_argb[0]];
+    dst_argb[1] = table[src_argb[1]];
+    dst_argb[2] = table[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+// Copies byte 3 (alpha) of each 4-byte source pixel into the matching
+// destination pixel; the other destination bytes are left untouched.
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[4 * i + 3] = src[4 * i + 3];
+  }
+}
+
+// Writes a row of single-byte samples into byte 3 (alpha) of each 4-byte
+// destination pixel; the other destination bytes are left untouched.
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst[4 * i + 3] = src[i];
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc
new file mode 100644
index 0000000..a804670
--- /dev/null
+++ b/third_party/libyuv/source/row_mips.cc
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+
+#ifdef HAS_COPYROW_MIPS
+// Copies count bytes from src to dst using hand-scheduled MIPS assembly.
+// Counts below 8 go straight to the byte loop at $last8. Otherwise, when the
+// low two bits of (src ^ dst) are zero, the data is moved with lw/sw in
+// 64-byte, 32-byte and 4-byte steps; when they differ, the "unaligned" path
+// reads with lwr/lwl pairs and stores whole words. Leftover bytes are copied
+// one at a time.
+// The asm returns to the caller itself with "j $ra" instead of falling out of
+// the asm statement.
+// Comments that mention a1/a2 appear to use register names from an earlier
+// stand-alone version of this routine (a1 ~ dst, a2 ~ count) - TODO confirm.
+// NOTE(review): %[count] is declared as an input-only operand but is written
+// inside the asm, and no "memory" clobber is listed - confirm this is safe
+// for every compiler and optimization level in use.
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // word-aligned now; count is the remaining byte count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the remainder count past 32-bytes
+ "beq %[count], $t8, chk1w \n"
+ // count=t8,no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+ // the first t3 may be equal t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ // Return to the caller directly from inside the asm block.
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here dst is word-aligned (src may not be) with less than 64 bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // the t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+ // NOTE(review): no nop follows the branch above, so under ".set noreorder"
+ // the first lwr below sits in its delay slot - confirm this is intended.
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ // Return to the caller directly from inside the asm block.
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ : [count] "r" (count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2)
+// De-interleaves a row of byte pairs from src_uv: the first byte of each pair
+// goes to dst_u and the second to dst_v.
+//   src_uv - interleaved source, 2 bytes per element.
+//   dst_u  - receives byte 0 of every pair.
+//   dst_v  - receives byte 1 of every pair.
+//   width  - number of pairs to split.
+// The main loop handles 16 pairs per iteration with lw/sw, so the pointers
+// are presumably expected to be word aligned (see the _Unaligned_ variant
+// otherwise) - TODO confirm against callers. The remaining width & 0xf pairs
+// are copied one at a time.
+// The residual count is tested before the tail loop is entered, so a width of
+// 0 copies nothing. A "memory" clobber tells the compiler that the asm reads
+// and writes memory through the pointer operands.
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ ".p2align 2 \n"
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ // Both the main loop and the "fewer than 16" shortcut arrive here, so the
+ // tail loop is only entered when at least one pair is left.
+ "2: \n"
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "4: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 4b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9",
+ "memory"
+ );
+}
+
+// De-interleaves a row of byte pairs from src_uv: the first byte of each pair
+// goes to dst_u and the second to dst_v. Same job as SplitUVRow_MIPS_DSPR2,
+// but every word access uses lwr/lwl and swr/swl pairs instead of lw/sw.
+//   src_uv - interleaved source, 2 bytes per element.
+//   dst_u  - receives byte 0 of every pair.
+//   dst_v  - receives byte 1 of every pair.
+//   width  - number of pairs to split.
+// The main loop handles 16 pairs per iteration; the remaining width & 0xf
+// pairs are copied one at a time.
+// The residual count is tested before the tail loop is entered, so a width of
+// 0 copies nothing. A "memory" clobber tells the compiler that the asm reads
+// and writes memory through the pointer operands.
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+ uint8* dst_v, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ ".p2align 2 \n"
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lwr $t0, 0(%[src_uv]) \n"
+ "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lwr $t1, 4(%[src_uv]) \n"
+ "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lwr $t2, 8(%[src_uv]) \n"
+ "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lwr $t3, 12(%[src_uv]) \n"
+ "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lwr $t5, 16(%[src_uv]) \n"
+ "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lwr $t6, 20(%[src_uv]) \n"
+ "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lwr $t7, 24(%[src_uv]) \n"
+ "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lwr $t8, 28(%[src_uv]) \n"
+ "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "swr $t9, 0(%[dst_v]) \n"
+ "swl $t9, 3(%[dst_v]) \n"
+ "swr $t0, 0(%[dst_u]) \n"
+ "swl $t0, 3(%[dst_u]) \n"
+ "swr $t1, 4(%[dst_v]) \n"
+ "swl $t1, 7(%[dst_v]) \n"
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n"
+ "swr $t3, 8(%[dst_v]) \n"
+ "swl $t3, 11(%[dst_v]) \n"
+ "swr $t5, 8(%[dst_u]) \n"
+ "swl $t5, 11(%[dst_u]) \n"
+ "swr $t6, 12(%[dst_v]) \n"
+ "swl $t6, 15(%[dst_v]) \n"
+ "swr $t7, 12(%[dst_u]) \n"
+ "swl $t7, 15(%[dst_u]) \n"
+ "addiu %[dst_u], %[dst_u], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_v], %[dst_v], 16 \n"
+
+ // Both the main loop and the "fewer than 16" shortcut arrive here, so the
+ // tail loop is only entered when at least one pair is left.
+ "2: \n"
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "4: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 4b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9",
+ "memory"
+ );
+}
+
+// Writes the bytes of src to dst in reverse order.
+//   src   - first byte of the source row.
+//   dst   - first byte of the destination row.
+//   width - number of bytes to mirror.
+// src is first advanced to the end of the row and then walked backwards. The
+// main loop reverses 16 bytes per iteration (wsbh + rotr 16 reverses the four
+// bytes of a word) using lw/sw, so src + width and dst are presumably
+// expected to be word aligned - TODO confirm against callers. The remaining
+// width & 0xf bytes are copied one at a time.
+// The tail loop tests the already-decremented counter with bgtz, so it copies
+// exactly width & 0xf bytes; with bgez it ran once more, reading one byte
+// before src and writing one byte past the end of dst. The residual count is
+// also tested before the tail is entered, so a width of 0 copies nothing.
+// A "memory" clobber tells the compiler that the asm reads and writes memory
+// through the pointer operands.
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t4, %[width], 4 \n" // multiples of 16
+ "andi $t5, %[width], 0xf \n" // residual
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ // Both the main loop and the "fewer than 16" shortcut arrive here, so the
+ // tail loop is only entered when at least one byte is left.
+ "2: \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "4: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t5, 4b \n" // loop while bytes remain
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r" (src), [dst] "+r" (dst)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "memory"
+ );
+}
+
+// Reverses a row of interleaved byte pairs and splits it at the same time:
+// walking src_uv from its last pair to its first, byte 0 of each pair is
+// appended to dst_u and byte 1 to dst_v.
+//   src_uv - interleaved source, 2 bytes per element.
+//   dst_u  - receives byte 0 of every pair, in reversed order.
+//   dst_v  - receives byte 1 of every pair, in reversed order.
+//   width  - number of pairs to process.
+// src_uv is first advanced by 2 * width bytes and then walked backwards. The
+// main loop handles 16 pairs per iteration, loading with lw (so the end of
+// the source row is presumably expected to be word aligned - TODO confirm)
+// and storing with swr/swl. The remaining width & 0xf pairs are copied one at
+// a time.
+// $t6 is written by the main loop and is therefore listed as clobbered. The
+// residual count is tested before the tail loop is entered, so a width of 0
+// copies nothing. A "memory" clobber tells the compiler that the asm reads
+// and writes memory through the pointer operands.
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int x = 0;
+ int y = 0;
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n" // bytes in the row
+ "srl %[x], %[width], 4 \n" // multiples of 16
+ "andi %[y], %[width], 0xf \n" // residual
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n" // src_uv += 2 * width
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ // Both the main loop and the "fewer than 16" shortcut arrive here, so the
+ // tail loop is only entered when at least one pair is left.
+ "2: \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+
+ "4: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 4b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v),
+ [x] "=&r" (x),
+ [y] "+r" (y)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8", "t9",
+ "memory"
+ );
+}
+
+// Shared asm fragment used by the I422To*Row_MIPS_DSPR2 functions below.
+// Loads 4 Y bytes (lw from y_buf) plus 2 U and 2 V bytes (lhu from u_buf and
+// v_buf), advances u_buf and v_buf by 2 (y_buf is not advanced here), and
+// converts them to R, G and B values, one per halfword.
+// Reads $s0-$s5, which the enclosing function must have loaded beforehand,
+// and overwrites $t0-$t6, $t8 and $t9.
+// The closing subtract-$s5 / saturating shift left / shift right / add-$s5
+// sequence presumably clamps each result to 0..255 - TODO confirm.
+// Results are arranged as:
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define I422ToTransientMipsRGB \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
+ "addu.ph $t1, $t1, $s5 \n"
+
+// Converts a row of planar Y, U, V samples to 32-bit pixels whose register
+// layout is |ff|R|G|B| (most to least significant byte).
+//   y_buf   - Y samples, one per pixel.
+//   u_buf   - U samples, one per two pixels.
+//   v_buf   - V samples, one per two pixels.
+//   rgb_buf - destination, 4 bytes per pixel.
+//   width   - number of pixels to convert.
+// Each iteration converts 4 pixels (4 Y, 2 U, 2 V in, 16 bytes out) and
+// subtracts 4 from width; the loop ends only when width reaches exactly 0,
+// so width must be 0 or a positive multiple of 4. Y is read with lw and the
+// output written with sw, so y_buf and rgb_buf are presumably expected to be
+// word aligned - TODO confirm against callers.
+// A "memory" clobber tells the compiler that the asm reads and writes memory
+// through the pointer operands.
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128| // clipping
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into argb format
+ "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
+ "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
+ "addiu %[width], -4 \n"
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
+ "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6",
+ "memory"
+ );
+}
+
+// Converts a row of planar Y, U, V samples to 32-bit pixels whose register
+// layout is |ff|B|G|R| (most to least significant byte).
+//   y_buf   - Y samples, one per pixel.
+//   u_buf   - U samples, one per two pixels.
+//   v_buf   - V samples, one per two pixels.
+//   rgb_buf - destination, 4 bytes per pixel.
+//   width   - number of pixels to convert.
+// Each iteration converts 4 pixels (4 Y, 2 U, 2 V in, 16 bytes out) and
+// subtracts 4 from width; the loop ends only when width reaches exactly 0,
+// so width must be 0 or a positive multiple of 4. Y is read with lw and the
+// output written with sw, so y_buf and rgb_buf are presumably expected to be
+// word aligned - TODO confirm against callers.
+// A "memory" clobber tells the compiler that the asm reads and writes memory
+// through the pointer operands.
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into abgr format
+ "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
+ "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
+ "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
+ "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
+
+ "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
+ "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
+ "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6",
+ "memory"
+ );
+}
+
+// Converts a row of planar Y, U, V samples to 32-bit pixels whose register
+// layout is |B|G|R|ff| (most to least significant byte).
+//   y_buf   - Y samples, one per pixel.
+//   u_buf   - U samples, one per two pixels.
+//   v_buf   - V samples, one per two pixels.
+//   rgb_buf - destination, 4 bytes per pixel.
+//   width   - number of pixels to convert.
+// Each iteration converts 4 pixels (4 Y, 2 U, 2 V in, 16 bytes out) and
+// subtracts 4 from width; the loop ends only when width reaches exactly 0,
+// so width must be 0 or a positive multiple of 4. Y is read with lw and the
+// output written with sw, so y_buf and rgb_buf are presumably expected to be
+// word aligned - TODO confirm against callers.
+// A "memory" clobber tells the compiler that the asm reads and writes memory
+// through the pointer operands.
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff \n"
+ "ori $s6, 0xff \n" // |00|ff|00|ff|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+ // Arranging into bgra format
+ "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+ "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
+ "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
+ "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
+ "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
+ "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
+ "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
+ "sll $t1, $t1, 16 \n"
+ "sll $t2, $t2, 16 \n"
+ "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
+ "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6",
+ "memory"
+ );
+}
+
+// Bilinear filter 8x2 -> 8x1
+// Blends two source rows into one destination row, byte by byte:
+//   dst = (row0 * (256 - source_y_fraction) + row1 * source_y_fraction) >> 8
+// where row0 starts at src_ptr and row1 at src_ptr + src_stride.
+//   dst_ptr           - destination row.
+//   src_ptr           - first source row.
+//   src_stride        - byte offset from the first source row to the second.
+//   dst_width         - number of bytes to produce.
+//   source_y_fraction - weight of the second row, out of 256.
+// The loop produces 8 bytes per iteration and tests dst_width only after
+// each iteration, so at least 8 bytes are always written and dst_width is
+// presumably expected to be a positive multiple of 8 - TODO confirm against
+// callers. All accesses use lw/sw, so the pointers are presumably expected
+// to be word aligned.
+// src_stride is consumed in C to form the second row pointer and is not
+// passed to the asm, which never referenced it. A "memory" clobber tells the
+// compiler that the asm reads and writes memory through the pointer operands.
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n" // weight of row 0
+ "replv.ph $t1, %[source_y_fraction] \n" // weight of row 1
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra.ph $t6, $t6, 8 \n"
+ "shra.ph $t7, $t7, 8 \n"
+ "shra.ph $t2, $t2, 8 \n"
+ "shra.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r" (dst_ptr),
+ [src_ptr1] "+r" (src_ptr1),
+ [src_ptr] "+r" (src_ptr),
+ [dst_width] "+r" (dst_width)
+ : [source_y_fraction] "r" (source_y_fraction),
+ [y0_fraction] "r" (y0_fraction)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "memory"
+ );
+}
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc
new file mode 100644
index 0000000..c5ae2c5
--- /dev/null
+++ b/third_party/libyuv/source/row_neon.cc
@@ -0,0 +1,2844 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.16 {d2[0]}, [%1]! \n" \
+ "vld1.16 {d2[1]}, [%2]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d2, d3 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUV422TORGB \
+ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
+ "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
+ "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
+ "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
+ "vtrn.u8 d0, d1 \n" \
+ "vsub.s16 q0, q0, q15 \n"/* offset y */\
+ "vmul.s16 q0, q0, q14 \n" \
+ "vadd.s16 d18, d19 \n" /* sum u and v terms of G */ \
+ "vqadd.s16 d20, d0, d16 \n" /* B */ \
+ "vqadd.s16 d21, d1, d16 \n" \
+ "vqadd.s16 d22, d0, d17 \n" /* R */ \
+ "vqadd.s16 d23, d1, d17 \n" \
+ "vqadd.s16 d16, d0, d18 \n" /* G */ \
+ "vqadd.s16 d17, d1, d18 \n" \
+ "vqshrun.s16 d0, q10, #6 \n" /* B */ \
+ "vqshrun.s16 d1, q11, #6 \n" /* R */ \
+ "vqshrun.s16 d2, q8, #6 \n" /* G */ \
+ "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
+ "vmovl.u8 q11, d1 \n" \
+ "vmovl.u8 q8, d2 \n" \
+ "vtrn.u8 d20, d21 \n" \
+ "vtrn.u8 d22, d23 \n" \
+ "vtrn.u8 d16, d17 \n" \
+ "vmov.u8 d21, d16 \n" /* result: d20 = B, d21 = G, d22 = R */
+
+static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (  // Converts planar Y, U, V rows to ARGB, 8 pixels per loop.
+ "vld1.8 {d24}, [%5] \n"  // d24 = first 8 bytes of kUVToRB
+ "vld1.8 {d25}, [%6] \n"  // d25 = first 8 bytes of kUVToG
+ "vmov.u8 d26, #128 \n"  // chroma bias removed by YUV422TORGB
+ "vmov.u16 q14, #74 \n"  // Y multiplier used by YUV422TORGB
+ "vmov.u16 q15, #16 \n"  // Y offset used by YUV422TORGB
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV444  // loads 8 Y, 8 U, 8 V and averages adjacent U/V pairs
+ YUV422TORGB  // leaves B, G, R in d20, d21, d22
+ "subs %4, %4, #8 \n"  // 8 pixels consumed
+ "vmov.u8 d23, #255 \n"  // opaque alpha
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"  // interleaved store of 8 pixels
+ "bgt 1b \n"  // loop while remaining width > 0
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV411
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_abgr), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTORGB565 \
+ "vshr.u8 d20, d20, #3 \n" /* B */ \
+ "vshr.u8 d21, d21, #2 \n" /* G */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #11 \n" /* R */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q0, q0, q10 \n" /* BGR */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB1555 \
+ "vshr.u8 q10, q10, #3 \n" /* B */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vshr.u8 d23, d23, #7 \n" /* A */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vmovl.u8 q11, d23 \n" /* A */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #10 \n" /* R */ \
+ "vshl.u16 q11, q11, #15 \n" /* A */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q1, q10, q11 \n" /* RA */ \
+ "vorr q0, q0, q1 \n" /* BGRA */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV400
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (  // Expands 8 grey bytes per loop to ARGB with B = G = R = Y.
+ "vmov.u8 d23, #255 \n"  // d23 = opaque alpha, constant across the loop
+ ".p2align 2 \n"  // align the loop label itself, as the other kernels do
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"  // load 8 Y
+ "vmov d21, d20 \n"  // replicate Y into the second and third channels
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"  // 8 pixels consumed
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"  // interleaved store of 8 pixels
+ "bgt 1b \n"  // loop while remaining width > 0
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23"
+ );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUY2
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READUYVY
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV; de-interleaves into q0/q1
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n" // loop while remaining width > 0
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Copy multiple of 32 bytes. vld1.8 allows unaligned access and is fastest on a15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n" // loop while remaining count > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow_NEON writes 'count' bytes using a 32 bit value repeated.
+void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (  // Stores whole 16-byte blocks, so count is effectively rounded up to 16.
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n" // loop while remaining count > 0
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
+}
+
+// TODO(fbarchard): Make fully assembler
+// ARGBSetRows_NEON fills 'height' rows of 'width' pixels with the 32 bit value v32.
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ // SetRow_NEON takes a byte count; width << 2 is 4 bytes per pixel.
+ for (int rows_left = height; rows_left > 0; --rows_left, dst += dst_stride) {
+ SetRow_NEON(dst, v32, width << 2);
+ }
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (  // Writes the bytes of src to dst in reverse order, 16 per loop.
+ // Start at end of source row.
+ "mov r3, #-16 \n"  // r3 = post-index step: walk the source backwards
+ "add %0, %0, %2 \n"  // src += width
+ "sub %0, #16 \n"  // point at the last 16 bytes of the row
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"  // reverse the bytes inside each 64-bit half
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"  // halves stored swapped to finish the 16-byte reversal
+ "bgt 1b \n"  // loop while remaining width > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0"
+ );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB /* in: q0 = 8 ARGB1555 pixels; out: d0..d3 = B,G,R,A */ \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G; last line, so no continuation */
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ asm volatile (  // Expands 8 ARGB1555 pixels per loop to 8-bit-per-channel ARGB.
+ // No alpha preset here: ARGB1555TOARGB writes all of q1 (d2 = R, d3 = A) itself.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB  // q0 -> d0..d3 = B, G, R, A (q2, q3 are scratch)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"  // loop while remaining pix > 0
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ asm volatile (  // Expands 8 ARGB4444 pixels per loop to 8-bit-per-channel ARGB.
+ // No alpha preset here: ARGB4444TOARGB writes all of q1 before reading it.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB  // q0 -> d0..d3 = B, G, R, A (q2 is scratch)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"  // loop while remaining pix > 0
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (  // Extracts U and V from two YUY2 rows and averages them vertically.
+ "add %1, %0, %1 \n" // stride + src_yuy2; %1 is the second-row pointer from here on
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n" // loop while remaining pix > 0
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (  // Writes the rounded average of two adjacent rows, 16 bytes per loop.
+ // change the stride to row 2 pointer
+ "add %1, %0 \n" // %1 = src_uv + src_uv_stride
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
+ "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ "vst1.8 {q0}, [%2]! \n" // store 16 averaged bytes
+ "bgt 1b \n" // loop while remaining pix > 0
+ : "+r"(src_uv), // %0
+ "+r"(src_uv_stride), // %1
+ "+r"(dst_uv), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ asm volatile (  // selector holds 4 byte indices, each applied to a 16-byte group of 4 pixels.
+ "vmov.u32 d6[0], %3 \n" // selector; only lanes 0-3 of d6 are set
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
+ "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
+ "vtrn.u32 d4, d5 \n" // combine 8 pixels; unset-lane results end up in d5, unused
+ "vst1.8 {d4}, [%1]! \n" // store 8.
+ "bgt 1b \n" // loop while remaining pix > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ : "r"(selector) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// Select G channels from ARGB. e.g. GGGGGGGG
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /*selector*/, int pix) {
+ asm volatile (
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) { // Interleaves separate Y, U and V rows into packed YUY2; 16 pixels (32 bytes out) per loop.
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys (even Ys -> d0, odd Ys -> d2)
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels (byte order Y, U, Y, V).
+ "bgt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) { // Interleaves separate Y, U and V rows into packed UYVY; 16 pixels (32 bytes out) per loop.
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys (even Ys -> d1, odd Ys -> d3)
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels (byte order U, Y, V, Y).
+ "bgt 1b \n" // repeat while the remaining width is > 0.
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { // Converts 8 ARGB pixels (32 bytes) to 8 RGB565 pixels (16 bytes) per loop.
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565 // macro defined outside this chunk; presumably packs d20-d23 into q0 - TODO confirm.
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int pix) { // Converts 8 ARGB pixels (32 bytes) to 8 ARGB1555 pixels (16 bytes) per loop.
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555 // macro defined outside this chunk; presumably packs d20-d23 into q0 - TODO confirm.
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int pix) { // Converts 8 ARGB pixels (32 bytes) to 8 ARGB4444 pixels (16 bytes) per loop.
+ asm volatile (
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444 // macro defined outside this chunk; presumably packs d20-d23 into q0 using the d4 mask - TODO confirm.
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11" // q2 listed because d4 is overwritten by the vmov above.
+ );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { // Y = sat8((13*B + 65*G + 33*R) >> 7, rounded) + 16 (saturating); 8 pixels per loop.
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y (rounding, saturating)
+ "vqadd.u8 d0, d27 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { // Y = sat8((15*B + 75*G + 38*R) >> 7, rounded), with no +16 offset; 8 pixels per loop.
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y (rounding, saturating)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// 8x1 pixels. One U and one V byte is produced per ARGB pixel (no subsampling); 8 pixels per loop.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" // NOTE(review): q4 is listed but not referenced by the asm above.
+ );
+}
+
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. Each U/V uses the sum of 2 horizontally adjacent pixels, so the coefficients are pre-halved.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. Each U/V uses 4 horizontally adjacent pixels: vpaddl + vpadd sum them, vrshr #1 halves the sum to match the pre-halved coefficients.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
+ "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
+ "vpadd.u16 d1, d8, d9 \n" // B
+ "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
+ "vpadd.u16 d3, d10, d11 \n" // G
+ "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
+ "vpadd.u16 d5, d12, d13 \n" // R
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %3, %3, #32 \n" // 32 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// RGBTOUV(QB, QG, QR): from 8 x 16-bit B/G/R values, writes 8 U bytes to d0 and 8 V bytes to d1. Reads coefficients from q10-q14 and the bias from q15; uses q8/q9 as scratch. Used by the 16x2 pixels -> 8x1 functions below.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 ARGB pixels -> 8 U and 8 V per loop; the second row is read at src_argb + src_stride_argb.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) { // Same structure as ARGBToUVRow_NEON but with the 127/84/43/20/107 coefficient set; 16x2 pixels -> 8 U and 8 V per loop.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 BGRA pixels -> 8 U and 8 V per loop; channels are taken from q1 (R), q2 (G), q3 (B), q0 is unused.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 ABGR pixels -> 8 U and 8 V per loop; channels are taken from q0 (R), q1 (G), q2 (B).
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 RGBA pixels -> 8 U and 8 V per loop; the loaded q1/q2/q3 (B/G/R) are pair-summed down into q0/q1/q2.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 RGB24 pixels (3 bytes each, loaded with vld3) -> 8 U and 8 V per loop.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix) { // 16x2 RAW pixels (3 bytes each; q0 = R, q1 = G, q2 = B after vld3) -> 8 U and 8 V per loop.
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 pixels wide (x 2 rows) processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of RGB565 pixels. e.g. 16. Each 16-byte load is 8 pixels; RGB565TOARGB is defined outside this chunk and presumably leaves B/G/R in d0/d1/d2 - TODO confirm.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb565
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels (second row).
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels (second row).
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of ARGB1555 pixels. e.g. 16. Each 16-byte load is 8 pixels; RGB555TOARGB is defined outside this chunk and presumably leaves B/G/R in d0/d1/d2 - TODO confirm.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb1555
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels (second row).
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels (second row).
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of ARGB4444 pixels. e.g. 16. Each 16-byte load is 8 pixels; ARGB4444TOARGB is defined outside this chunk and presumably leaves B/G/R in d0/d1/d2 - TODO confirm.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb4444
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels (second row).
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels (second row).
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1 (stride on entry, second-row pointer after the add)
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { // 8 RGB565 pixels (16 bytes) -> 8 Y bytes per loop, same 13/65/33 weights and +16 as ARGBToYRow_NEON.
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB // macro defined outside this chunk; presumably expands q0 to B/G/R in d0/d1/d2 - TODO confirm.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { // 8 ARGB1555 pixels (16 bytes) -> 8 Y bytes per loop, same 13/65/33 weights and +16 as ARGBToYRow_NEON.
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB // macro defined outside this chunk; presumably expands q0 to B/G/R in d0/d1/d2 - TODO confirm.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { // 8 ARGB4444 pixels (16 bytes) -> 8 Y bytes per loop, same 13/65/33 weights and +16 as ARGBToYRow_NEON.
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB // macro defined outside this chunk; presumably expands q0 to B/G/R in d0/d1/d2 - TODO confirm.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { // 8 BGRA pixels -> 8 Y bytes per loop; R, G, B are read from d1, d2, d3 and d0 is ignored.
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { // 8 ABGR pixels -> 8 Y bytes per loop; R, G, B are read from d0, d1, d2 and d3 is ignored.
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n" // saturating +16
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // repeat while the remaining count is > 0.
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Converts a row of RGBA pixels to 8-bit luma.
+// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7), then a
+// saturating + 16.
+// vld4.8 de-interleaves the four bytes of each pixel into d0..d3; d1, d2 and
+// d3 are used as B, G and R. d0 is never read (presumably alpha).
+// 8 pixels are consumed per pass and the loop repeats while the remaining
+// count is > 0, so pix is effectively rounded up to a multiple of 8.
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n" // Y += 16, saturating.
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // flags still from subs above.
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Converts a row of 3-byte RGB24 pixels to 8-bit luma.
+// Per pixel: Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7), then a
+// saturating + 16.
+// vld3.8 de-interleaves the three bytes of each pixel into d0, d1, d2, which
+// are used as B, G and R. d3 is listed as a clobber but is not touched here.
+// 8 pixels (24 source bytes) are consumed per pass and the loop repeats while
+// the remaining count is > 0, so pix is effectively rounded up to a multiple
+// of 8.
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n" // Y += 16, saturating.
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // flags still from subs above.
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Converts a row of 3-byte RAW pixels to 8-bit luma.
+// Per pixel: Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7), then a
+// saturating + 16.
+// vld3.8 de-interleaves the three bytes of each pixel into d0, d1, d2. They
+// are multiplied by the R, G and B coefficients respectively, i.e. the byte
+// order is the reverse of the one RGB24ToYRow_NEON assumes.
+// 8 pixels (24 source bytes) are consumed per pass and the loop repeats while
+// the remaining count is > 0, so pix is effectively rounded up to a multiple
+// of 8.
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R (d4 holds the R coefficient)
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B (d6 holds the B coefficient)
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n" // Y += 16, saturating.
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n" // flags still from subs above.
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Bilinear filter 16x2 -> 16x1
+// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
+// the result to dst_ptr, 16 bytes per pass.
+// source_y_fraction selects the weight of the second row out of 256:
+//   0    copy the first row unchanged (the second row is never read),
+//   64   75% first row / 25% second row (two rounding halving adds),
+//   128  50% / 50% (one rounding halving add),
+//   192  25% first row / 75% second row,
+//   else dst = (row0 * (256 - f) + row1 * f + 128) >> 8.
+// Every loop repeats while the remaining width is > 0 after subtracting 16,
+// so dst_width is effectively rounded up to a multiple of 16.
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n" // %2 now points at the second row.
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n" // d5 = f, weight of second row.
+ "rsb %4, #256 \n" // %4 = 256 - f.
+ "vdup.8 d4, %4 \n" // d4 = 256 - f, weight of first row.
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n" // rounding >> 8, narrow to 8 bit.
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n" // average of both rows.
+ "vrhadd.u8 q0, q1 \n" // average again with the second row.
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ // Same as the 25 / 75 loop with the two source rows swapped between q0/q1.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+ );
+}
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+// Blends src_argb0 over src_argb1 using the alpha channel of src_argb0 (d3)
+// and writes the result to dst_argb with alpha forced to 255.
+// All subtractions and additions saturate at 8 bits.
+// The row is processed 8 pixels at a time while at least 8 remain, then the
+// remaining 0..7 pixels are processed one at a time using lane 0 only.
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %3, #8 \n" // bias width by -8 for the 8 pixel loop.
+ "blt 89f \n" // fewer than 8 pixels: go to the tail.
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n" // undo the -8 bias, pre-bias by -1.
+ "blt 99f \n" // nothing left.
+
+ // Blend 1 pixels.
+ // Only lane 0 of each register is loaded and stored; the other lanes hold
+ // stale data that is computed on but never written out.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ );
+}
+
+// Attenuate 8 pixels at a time.
+// Multiplies the B, G and R channel of every pixel by that pixel's alpha:
+// c = sat8((c * a + 128) >> 8). The alpha byte (d3) is stored unchanged.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ );
+}
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+// Operates in place on dst_argb: the load does not advance the pointer, the
+// store does. B, G and R are quantized; the alpha byte (d6) is written back
+// untouched.
+// scale is halved up front because vqdmulh doubles the product before taking
+// the high 16 bits. Only the low 16 bits of scale, interval_size and
+// interval_offset are used (vdup.u16).
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n" // g
+ "vmovl.u8 q2, d4 \n" // r
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n" // saturate back to 8 bit.
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ );
+}
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+// value packs one 8-bit scale per channel. vzip.u8 duplicates each byte of it
+// into a 16-bit lane (v * 0x0101), which is then halved so that vqrdmulh's
+// doubling yields roughly channel * v / 255.
+// All four channels, including alpha, are scaled.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n" // g
+ "vmovl.u8 q12, d24 \n" // r
+ "vmovl.u8 q13, d26 \n" // a
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n" // saturate back to 8 bit.
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+// The gray value is written to the B, G and R bytes; the alpha byte (d3) is
+// copied through from the source.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Operates in place on dst_argb: the load does not advance the pointer, the
+// store does. The shifts truncate (no rounding) and saturate to 8 bits; the
+// alpha byte (d3) is written back untouched.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // RB coefficient
+ "vmov.u8 d29, #98 \n" // RG coefficient
+ "vmov.u8 d30, #50 \n" // RR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+//
+// matrix_argb holds 16 signed 8-bit coefficients, read as four groups of four:
+// one group per output channel (B, G, R, A), each group holding the weights
+// for the input b, g, r and a. Each output channel is
+//   sat8((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6)
+// with the partial sums accumulated using saturating signed 16-bit adds.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+//
+// The widened input alpha lives in q11. It must not share a register with the
+// A accumulator (q15), otherwise the "A * Matrix" terms would be computed from
+// the partial sum instead of from the source alpha.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ "vld1.8 {q2}, [%3] \n" // load 16 matrix coefficients.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a (kept apart from accumulator q15)
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+// Every channel, alpha included, becomes (c0 * c1 + 128) >> 8.
+// The two rows are loaded into alternating d registers (even = row 0,
+// odd = row 1) so that each channel pair can be multiplied directly.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+// Every byte, alpha included, is added with unsigned saturation at 255.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+// Computes src_argb0 - src_argb1 per byte, alpha included, with unsigned
+// saturation at 0.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+// The sum saturates at 255. d3 (alpha) is set once before the loop and is
+// reused for every store.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n" // G = sobel
+ "vmov.u8 d2, d0 \n" // R = sobel
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+// The sum saturates at 255 and is written as one byte per pixel.
+// 16 pixels are processed per pass and the loop repeats while the remaining
+// count is > 0, so width is effectively rounded up to a multiple of 16.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+// Sobel X is loaded straight into d2 (R) and Sobel Y into d0 (B) so that only
+// the saturating sum in d1 (G) has to be computed before the interleaved
+// store. d3 (alpha) is set once before the loop.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+// For each of the three rows the code loads 8 bytes at x and 8 bytes at
+// x + 2 and takes their difference; the middle row is added twice. The
+// absolute value of the sum is saturated to 8 bits and stored.
+// Each row pointer is post-incremented by 2 and then by 6, i.e. by 8 bytes
+// per pass. Because of the x + 2 load, every pass reads 2 bytes beyond the
+// 8 pixels it produces.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+// Uses two rows only (the middle matrix row is zero). For the columns x,
+// x + 1 and x + 2 the code takes src_y0 - src_y1; the x + 1 column is added
+// twice. The absolute value of the sum is saturated to 8 bits and stored.
+// Each row pointer is post-incremented by 1, 1 and then 6, i.e. by 8 bytes
+// per pass. Because of the x + 2 load, every pass reads 2 bytes beyond the
+// 8 pixels it produces.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// width is effectively rounded up to a multiple of 8.
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc
new file mode 100644
index 0000000..e477088
--- /dev/null
+++ b/third_party/libyuv/source/row_posix.cc
@@ -0,0 +1,6443 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Luma weights for ARGB input, one 4-byte group per pixel, four pixels per
+// vector. The fourth byte of every group is weighted 0.
+static vec8 kARGBToY = {
+  13, 65, 33, 0,
+  13, 65, 33, 0,
+  13, 65, 33, 0,
+  13, 65, 33, 0
+};
+
+// Same layout, JPeg full range weights.
+static vec8 kARGBToYJ = {
+  15, 75, 38, 0,
+  15, 75, 38, 0,
+  15, 75, 38, 0,
+  15, 75, 38, 0
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+// Chroma (U / V) weights for ARGB input, one 4-byte group per pixel.
+// The J variants are the full range counterparts.
+static vec8 kARGBToU = {
+  112, -74, -38, 0,
+  112, -74, -38, 0,
+  112, -74, -38, 0,
+  112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {
+  127, -84, -43, 0,
+  127, -84, -43, 0,
+  127, -84, -43, 0,
+  127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {
+  -18, -94, 112, 0,
+  -18, -94, 112, 0,
+  -18, -94, 112, 0,
+  -18, -94, 112, 0
+};
+
+static vec8 kARGBToVJ = {
+  -20, -107, 127, 0,
+  -20, -107, 127, 0,
+  -20, -107, 127, 0,
+  -20, -107, 127, 0
+};
+
+// Y / U / V weights for BGRA input. Same values as the ARGB tables with the
+// bytes of each group in reverse order (zero weight first).
+static vec8 kBGRAToY = {
+  0, 33, 65, 13,
+  0, 33, 65, 13,
+  0, 33, 65, 13,
+  0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+  0, -38, -74, 112,
+  0, -38, -74, 112,
+  0, -38, -74, 112,
+  0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+  0, 112, -94, -18,
+  0, 112, -94, -18,
+  0, 112, -94, -18,
+  0, 112, -94, -18
+};
+
+// Y / U / V weights for ABGR input. The first and third byte of each group
+// are swapped relative to the ARGB tables.
+static vec8 kABGRToY = {
+  33, 65, 13, 0,
+  33, 65, 13, 0,
+  33, 65, 13, 0,
+  33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+  -38, -74, 112, 0,
+  -38, -74, 112, 0,
+  -38, -74, 112, 0,
+  -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+  112, -94, -18, 0,
+  112, -94, -18, 0,
+  112, -94, -18, 0,
+  112, -94, -18, 0
+};
+
+// Y / U / V weights for RGBA input. Same values as the ARGB tables shifted by
+// one byte (zero weight first).
+static vec8 kRGBAToY = {
+  0, 13, 65, 33,
+  0, 13, 65, 33,
+  0, 13, 65, 33,
+  0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+  0, 112, -74, -38,
+  0, 112, -74, -38,
+  0, 112, -74, -38,
+  0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+  0, -18, -94, 112,
+  0, -18, -94, 112,
+  0, -18, -94, 112,
+  0, -18, -94, 112
+};
+
+// Bias vectors: 16 in every byte, 64 in every 16-bit lane, 128 in every byte,
+// and 0x8080 in every 16-bit lane.
+static uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u
+};
+
+static vec16 kAddYJ64 = {
+  64, 64, 64, 64,
+  64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u,
+  0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Byte shuffle for RGB24 -> ARGB: each output pixel takes three consecutive
+// source bytes plus one filler byte (indices 12..15) in the alpha slot.
+static uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u,
+  3u, 4u, 5u, 13u,
+  6u, 7u, 8u, 14u,
+  9u, 10u, 11u, 15u
+};
+
+// Byte shuffle for RAW -> ARGB: as above with the three color bytes of each
+// pixel reversed.
+static uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u,
+  5u, 4u, 3u, 13u,
+  8u, 7u, 6u, 14u,
+  11u, 10u, 9u, 15u
+};
+
+// Byte shuffle for ARGB -> RGB24: drops every fourth byte and packs the 12
+// remaining bytes first; the last four entries are 128.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u,
+  4u, 5u, 6u,
+  8u, 9u, 10u,
+  12u, 13u, 14u,
+  128u, 128u, 128u, 128u
+};
+
+// Byte shuffle for ARGB -> RAW: as above with the three color bytes of each
+// pixel reversed.
+static uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u,
+  6u, 5u, 4u,
+  10u, 9u, 8u,
+  14u, 13u, 12u,
+  128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+// Same bytes as kShuffleMaskARGBToRGB24, but the 128 entries sit in positions
+// 8..11 and the final four packed bytes are moved to positions 12..15.
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u,
+  128u, 128u, 128u, 128u,
+  10u, 12u, 13u, 14u
+};
+
+// RAW counterpart of kShuffleMaskARGBToRGB24_0: same split layout with the
+// byte order of kShuffleMaskARGBToRAW.
+static uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u,
+  128u, 128u, 128u, 128u,
+  8u, 14u, 13u, 12u
+};
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#if defined(TESTING) && defined(__x86_64__)
+// Test-only routine, compiled only when TESTING is defined on x86-64.
+// The body first emits a series of mov / lea / add instructions over every
+// general purpose register (32-bit destinations), grouped in 32-byte aligned
+// batches, and then runs a small copy loop that loads 8 bytes from src_y,
+// stores the 16-byte xmm0 to dst_argb, and advances src by 8 and dst by 32
+// per pass.
+// NOTE(review): presumably the leading batches exist to inspect instruction
+// encodings rather than to be executed - they write esp/ebp and callee-saved
+// registers that are not in the clobber list, and "add 0x10,%%reg" (no '$')
+// is a memory operand at absolute address 0x10 in AT&T syntax. Confirm before
+// ever calling this function.
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ // reg-to-reg mov, legacy registers.
+ ".p2align 5 \n"
+ "mov %%eax,%%eax \n"
+ "mov %%ebx,%%ebx \n"
+ "mov %%ecx,%%ecx \n"
+ "mov %%edx,%%edx \n"
+ "mov %%esi,%%esi \n"
+ "mov %%edi,%%edi \n"
+ "mov %%ebp,%%ebp \n"
+ "mov %%esp,%%esp \n"
+ // reg-to-reg mov, r8d..r15d.
+ ".p2align 5 \n"
+ "mov %%r8d,%%r8d \n"
+ "mov %%r9d,%%r9d \n"
+ "mov %%r10d,%%r10d \n"
+ "mov %%r11d,%%r11d \n"
+ "mov %%r12d,%%r12d \n"
+ "mov %%r13d,%%r13d \n"
+ "mov %%r14d,%%r14d \n"
+ "mov %%r15d,%%r15d \n"
+ // lea with zero displacement, legacy registers.
+ ".p2align 5 \n"
+ "lea (%%rax),%%eax \n"
+ "lea (%%rbx),%%ebx \n"
+ "lea (%%rcx),%%ecx \n"
+ "lea (%%rdx),%%edx \n"
+ "lea (%%rsi),%%esi \n"
+ "lea (%%rdi),%%edi \n"
+ "lea (%%rbp),%%ebp \n"
+ "lea (%%rsp),%%esp \n"
+ // lea with zero displacement, r8..r15.
+ ".p2align 5 \n"
+ "lea (%%r8),%%r8d \n"
+ "lea (%%r9),%%r9d \n"
+ "lea (%%r10),%%r10d \n"
+ "lea (%%r11),%%r11d \n"
+ "lea (%%r12),%%r12d \n"
+ "lea (%%r13),%%r13d \n"
+ "lea (%%r14),%%r14d \n"
+ "lea (%%r15),%%r15d \n"
+
+ // lea with 0x10 displacement, legacy registers.
+ ".p2align 5 \n"
+ "lea 0x10(%%rax),%%eax \n"
+ "lea 0x10(%%rbx),%%ebx \n"
+ "lea 0x10(%%rcx),%%ecx \n"
+ "lea 0x10(%%rdx),%%edx \n"
+ "lea 0x10(%%rsi),%%esi \n"
+ "lea 0x10(%%rdi),%%edi \n"
+ "lea 0x10(%%rbp),%%ebp \n"
+ "lea 0x10(%%rsp),%%esp \n"
+ // lea with 0x10 displacement, r8..r15.
+ ".p2align 5 \n"
+ "lea 0x10(%%r8),%%r8d \n"
+ "lea 0x10(%%r9),%%r9d \n"
+ "lea 0x10(%%r10),%%r10d \n"
+ "lea 0x10(%%r11),%%r11d \n"
+ "lea 0x10(%%r12),%%r12d \n"
+ "lea 0x10(%%r13),%%r13d \n"
+ "lea 0x10(%%r14),%%r14d \n"
+ "lea 0x10(%%r15),%%r15d \n"
+
+ // add with source operand 0x10, legacy registers.
+ ".p2align 5 \n"
+ "add 0x10,%%eax \n"
+ "add 0x10,%%ebx \n"
+ "add 0x10,%%ecx \n"
+ "add 0x10,%%edx \n"
+ "add 0x10,%%esi \n"
+ "add 0x10,%%edi \n"
+ "add 0x10,%%ebp \n"
+ "add 0x10,%%esp \n"
+ // add with source operand 0x10, r8d..r15d.
+ ".p2align 5 \n"
+ "add 0x10,%%r8d \n"
+ "add 0x10,%%r9d \n"
+ "add 0x10,%%r10d \n"
+ "add 0x10,%%r11d \n"
+ "add 0x10,%%r12d \n"
+ "add 0x10,%%r13d \n"
+ "add 0x10,%%r14d \n"
+ "add 0x10,%%r15d \n"
+
+ // Copy loop: 8 source bytes in, one 16-byte aligned store out per pass.
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // TESTING
+
+#ifdef HAS_I400TOARGBROW_SSE2
+// Expands a row of 8-bit gray (Y) values to ARGB: each Y byte is replicated
+// into the three color bytes and the alpha byte is forced to 0xff.
+// 8 pixels are processed per pass (8 source bytes in, 32 bytes out) and the
+// loop repeats while the remaining count is > 0, so pix is effectively
+// rounded up to a multiple of 8.
+// The stores use movdqa, so dst_argb must be 16-byte aligned.
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // all ones ...
+ "pslld $0x18,%%xmm5 \n" // ... -> 0xff000000 alpha mask.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n" // load 8 Y.
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // Y -> YY
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n" // YY -> YYYY, first 4 pixels.
+ "punpckhwd %%xmm1,%%xmm1 \n" // YY -> YYYY, next 4 pixels.
+ "por %%xmm5,%%xmm0 \n" // set alpha.
+ "por %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+// Same conversion as I400ToARGBRow_SSE2 (Y replicated into the three color
+// bytes, alpha forced to 0xff, 8 pixels per pass), but the stores use movdqu,
+// so dst_argb does not need to be 16-byte aligned.
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// pix is effectively rounded up to a multiple of 8.
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // all ones ...
+ "pslld $0x18,%%xmm5 \n" // ... -> 0xff000000 alpha mask.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n" // load 8 Y.
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // Y -> YY
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n" // YY -> YYYY, first 4 pixels.
+ "punpckhwd %%xmm1,%%xmm1 \n" // YY -> YYYY, next 4 pixels.
+ "por %%xmm5,%%xmm0 \n" // set alpha.
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+// Converts a row of 3-byte RGB24 pixels to 4-byte ARGB with alpha = 0xff.
+// 16 pixels are processed per pass: 48 source bytes are loaded unaligned
+// (movdqu) into xmm0, xmm1 and xmm3, realigned with palignr so that each
+// register holds 12 bytes of whole pixels, expanded to 4 bytes per pixel with
+// pshufb using kShuffleMaskRGB24ToARGB, and OR-ed with 0xff000000.
+// The four 16-byte results are written with movdqa, so dst_argb must be
+// 16-byte aligned.
+// The loop repeats while the remaining count is > 0 after subtracting 16, so
+// pix is effectively rounded up to a multiple of 16.
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n" // xmm4 = shuffle mask.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n" // source bytes 24..39 (pixels 8..11).
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // source bytes 12..27 (pixels 4..7).
+ "pshufb %%xmm4,%%xmm0 \n" // pixels 0..3.
+ "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n" // source bytes 36..47 first (pixels 12..15).
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "jg 1b \n" // flags from sub; movdqa/lea leave them intact.
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+// Converts a row of 3-byte RAW pixels to 4-byte ARGB with alpha = 0xff.
+// Identical instruction sequence to RGB24ToARGBRow_SSSE3; the only difference
+// is the shuffle mask (kShuffleMaskRAWToARGB), which also reverses the three
+// color bytes of every pixel.
+// 16 pixels are processed per pass: 48 source bytes are loaded unaligned
+// (movdqu) and 64 bytes are written with movdqa, so dst_argb must be 16-byte
+// aligned.
+// The loop repeats while the remaining count is > 0 after subtracting 16, so
+// pix is effectively rounded up to a multiple of 16.
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n" // xmm4 = shuffle mask.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n" // source bytes 24..39 (pixels 8..11).
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n" // source bytes 12..27 (pixels 4..7).
+ "pshufb %%xmm4,%%xmm0 \n" // pixels 0..3.
+ "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n" // source bytes 36..47 first (pixels 12..15).
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "jg 1b \n" // flags from sub; movdqa/lea leave them intact.
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+// Converts a row of 16-bit RGB565 pixels to 4-byte ARGB with alpha = 0xff.
+// 8 pixels are processed per pass (16 source bytes in, 32 bytes out).
+// The 5-bit fields are widened to 8 bits with pmulhuw by 0x0108 and the
+// 6-bit field with pmulhuw by 0x2080, which replicates the top bits of each
+// field into its low bits.
+// dst is turned into an offset (dst - 2 * src) before the loop so that the
+// single advancing pointer %0 addresses both buffers: stores go to
+// (%1,%0,2), as the MEMOPMEM comments below show.
+// The stores use movdqa, so dst must be 16-byte aligned; the load is
+// unaligned (movdqu).
+// The loop repeats while the remaining count is > 0 after subtracting 8, so
+// pix is effectively rounded up to a multiple of 8.
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n" // xmm5 = 0x0108 per word (5 -> 8 bit).
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n" // xmm6 = 0x2080 per word (6 -> 8 bit).
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n" // xmm3 = 0xf800, top 5-bit field mask.
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n" // xmm4 = 0x07e0, middle 6-bit field mask.
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n" // xmm7 = 0xff00, alpha in the high byte.
+ "sub %0,%1 \n"
+ "sub %0,%1 \n" // %1 = dst - 2 * src.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // load 8 pixels.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n" // top 5-bit field.
+ "psllw $0xb,%%xmm2 \n" // low 5-bit field moved to the top.
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n" // word = (top field << 8) | low field.
+ "pand %%xmm4,%%xmm0 \n" // middle 6-bit field.
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n" // word = (0xff << 8) | middle field.
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n" // interleave to 4 bytes/pixel, pixels 0..3.
+ "punpckhbw %%xmm0,%%xmm2 \n" // pixels 4..7.
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 16 bytes of src, write 32 bytes of dst, pix -= 8
+ "mov $0x1080108,%%eax \n" // multiplier used with pmulhuw on fields placed in the top 5 bits
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n" // broadcast to all four dwords
+ "mov $0x42004200,%%eax \n" // multiplier used with pmulhuw on the field masked by xmm4
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n" // xmm3 = 0xf800 in every word
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n" // xmm4 = 0x03e0 in every word (bits 5..9)
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n" // xmm7 = 0xff00 in every word
+ "sub %0,%1 \n" // dst -= 2 * src, so (%1,%0,2) below tracks the output
+ "sub %0,%1 \n" // position for the current src pointer
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // unaligned load of 8 16-bit pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n" // bits 10..14 moved up to bits 11..15
+ "psllw $0xb,%%xmm2 \n" // bits 0..4 moved up to bits 11..15
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n" // xmm1 = two expanded channels per word
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n" // bits 5..9
+ "psraw $0x8,%%xmm2 \n" // arithmetic shift replicates bit 15 through the high byte
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n" // 0xff00 when bit 15 was set, else 0
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n" // interleave bytes: first 4 output pixels
+ "punpckhbw %%xmm0,%%xmm2 \n" // interleave bytes: last 4 output pixels
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src += 16 (output address follows via the index)
+ "sub $0x8,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 16 bytes of src, write 32 bytes of dst, pix -= 8
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n" // xmm4 = 0x0f in every byte (low-nibble mask)
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n" // xmm5 = 0xf0 in every byte (high-nibble mask)
+ "sub %0,%1 \n" // dst -= 2 * src, so (%1,%0,2) below tracks the output
+ "sub %0,%1 \n" // position for the current src pointer
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // unaligned load of 8 16-bit pixels
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n" // low nibble of every byte
+ "pand %%xmm5,%%xmm2 \n" // high nibble of every byte
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n" // each low nibble n becomes the byte 0xnn
+ "por %%xmm3,%%xmm2 \n" // each high nibble n becomes the byte 0xnn
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n" // interleave bytes: first 4 output pixels
+ "punpckhbw %%xmm2,%%xmm1 \n" // interleave bytes: last 4 output pixels
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src += 16 (output address follows via the index)
+ "sub $0x8,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src, write 0x30 bytes of dst, pix -= 16 (all moves unaligned)
+ "movdqa %3,%%xmm6 \n" // xmm6 = kShuffleMaskARGBToRGB24
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src += 64
+ "pshufb %%xmm6,%%xmm0 \n" // apply the shuffle mask to each 16-byte group
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n" // low 4 bytes of group 1 moved to the top 4 bytes
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n" // xmm0 = output bytes 0..15
+ "pslldq $0x8,%%xmm5 \n" // low 8 bytes of group 2 moved to the top 8 bytes
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n" // xmm1 = output bytes 16..31
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n" // xmm2 = output bytes 32..47
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n" // dst += 48
+ "sub $0x10,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src, write 0x30 bytes of dst, pix -= 16 (all moves unaligned)
+ "movdqa %3,%%xmm6 \n" // xmm6 = kShuffleMaskARGBToRAW
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src += 64
+ "pshufb %%xmm6,%%xmm0 \n" // apply the shuffle mask to each 16-byte group
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n" // low 4 bytes of group 1 moved to the top 4 bytes
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n" // xmm0 = output bytes 0..15
+ "pslldq $0x8,%%xmm5 \n" // low 8 bytes of group 2 moved to the top 8 bytes
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n" // xmm1 = output bytes 16..31
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n" // xmm2 = output bytes 32..47
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n" // dst += 48
+ "sub $0x10,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 16 bytes of src (aligned), write 8 bytes of dst, pix -= 4
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n" // xmm3 = 0x0000001f in every dword
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n" // xmm4 = 0x000007e0 in every dword
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n" // xmm5 = 0xfffff800 in every dword
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned load of 4 32-bit pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n" // top 5 bits of byte 0 -> bits 0..4
+ "psrld $0x5,%%xmm2 \n" // top 6 bits of byte 1 -> bits 5..10
+ "psrad $0x10,%%xmm0 \n" // byte 2 -> bits 8..15, sign-extended so packssdw below does not saturate
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n" // keep bits 11 and up
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n" // four dwords -> four 16-bit pixels in the low 8 bytes
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src += 16
+ "movq %%xmm0," MEMACCESS(1) " \n" // store the low 8 bytes
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst += 8
+ "sub $0x4,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 16 bytes of src (aligned), write 8 bytes of dst, pix -= 4
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n" // xmm4 = 0x0000001f in every dword
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n" // xmm5 = 0x000003e0 in every dword
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n" // xmm6 = 0x00007c00 in every dword
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n" // xmm7 = 0xffff8000 in every dword
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned load of 4 32-bit pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n" // bit 31 -> bit 15, sign-extended so packssdw below does not saturate
+ "psrld $0x3,%%xmm1 \n" // top 5 bits of byte 0 -> bits 0..4
+ "psrld $0x6,%%xmm2 \n" // top 5 bits of byte 1 -> bits 5..9
+ "psrld $0x9,%%xmm3 \n" // top 5 bits of byte 2 -> bits 10..14
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n" // four dwords -> four 16-bit pixels in the low 8 bytes
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src += 16
+ "movq %%xmm0," MEMACCESS(1) " \n" // store the low 8 bytes
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst += 8; MEMLEA (not MEMACCESS2) is the pointer-advance form used by every other lea here
+ "sub $0x4,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile ( // each pass: read 16 bytes of src (aligned), write 8 bytes of dst, pix -= 4
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n" // xmm4 = 0xf000 in every word
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n" // xmm3 = 0x00f0 in every word
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned load of 4 32-bit pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n" // high nibble of the low byte of each word
+ "pand %%xmm4,%%xmm1 \n" // high nibble of the high byte of each word
+ "psrlq $0x4,%%xmm0 \n" // -> bits 0..3 of the word
+ "psrlq $0x8,%%xmm1 \n" // -> bits 4..7 of the word
+ "por %%xmm1,%%xmm0 \n" // every word now fits in 8 bits
+ "packuswb %%xmm0,%%xmm0 \n" // words -> bytes; the result sits in the low 8 bytes
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src += 16
+ "movq %%xmm0," MEMACCESS(1) " \n" // store the low 8 bytes
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst += 8
+ "sub $0x4,%2 \n"
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_argb, write 0x10 bytes of dst_y, pix -= 16 (aligned moves)
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddY16
+ "movdqa %3,%%xmm4 \n" // xmm4 = kARGBToY
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "paddb %%xmm5,%%xmm0 \n" // add kAddY16 to every output byte
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqa/lea leave EFLAGS alone
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_argb, write 0x10 bytes of dst_y, pix -= 16 (movdqu for row data)
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddY16
+ "movdqa %3,%%xmm4 \n" // xmm4 = kARGBToY
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "paddb %%xmm5,%%xmm0 \n" // add kAddY16 to every output byte
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqu/lea leave EFLAGS alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_argb, write 0x10 bytes of dst_y, pix -= 16 (aligned moves)
+ "movdqa %3,%%xmm4 \n" // xmm4 = kARGBToYJ
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddYJ64
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n" // add kAddYJ64 to every word before the shift (presumably rounding)
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqa/lea leave EFLAGS alone
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_argb, write 0x10 bytes of dst_y, pix -= 16 (movdqu for row data)
+ "movdqa %3,%%xmm4 \n" // xmm4 = kARGBToYJ
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddYJ64
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n" // add kAddYJ64 to every word before the shift (presumably rounding)
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqu/lea leave EFLAGS alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+// TODO(fbarchard): pass xmm constants to single block of assembly.
+// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
+// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
+// or 4 if stack frame is disabled. Doing 2 assembly blocks is a workaround
+// and is considered unsafe.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned loads from the row at src_argb0
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToUJ
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToVJ
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUVJ128
+ :
+ : "m"(kARGBToUJ), // %0
+ "m"(kARGBToVJ), // %1
+ "m"(kAddUVJ128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned loads from the row at src_argb0
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToUJ
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToVJ
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "paddw %%xmm5,%%xmm0 \n" // kAddUVJ128 is added to the 16-bit sums, before the shift
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // unaligned loads from the row at src_argb0
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n" // average with the row at src_argb0 + stride (loaded via movdqu first)
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToUJ
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToVJ
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUVJ128
+ :
+ : "m"(kARGBToUJ), // %0
+ "m"(kARGBToVJ), // %1
+ "m"(kAddUVJ128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // unaligned loads from the row at src_argb0
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n" // average with the row at src_argb0 + stride (loaded via movdqu first)
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToUJ
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToVJ
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "paddw %%xmm5,%%xmm0 \n" // kAddUVJ128 is added to the 16-bit sums, before the shift
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes of src_argb in, 16 bytes to dst_u and 16 to dst_v, width -= 16 (aligned moves)
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // first half: multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n" // one 16-bit sum per 4 input bytes
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the SSE ops and lea below leave EFLAGS alone
+ "movdqa %%xmm0," MEMACCESS(1) " \n" // 16 bytes -> dst_u
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // second half: reload the same 64 source bytes
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_u += 16 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+ );
+}
+
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
+ uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes of src_argb in, 16 bytes to dst_u and 16 to dst_v, width -= 16 (movdqu for row data)
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // first half: multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n" // one 16-bit sum per 4 input bytes
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the SSE ops and lea below leave EFLAGS alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n" // 16 bytes -> dst_u
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // second half: reload the same 64 source bytes
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb += 64
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_u += 16 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+ );
+}
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes of one row in, 8 bytes to dst_u and 8 to dst_v, width -= 16 (aligned loads)
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kARGBToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kARGBToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes of one row in, 8 bytes to dst_u and 8 to dst_v, width -= 16 (unaligned loads)
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_argb0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kARGBToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kARGBToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_bgra, write 0x10 bytes of dst_y, pix -= 16 (aligned moves)
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddY16
+ "movdqa %3,%%xmm4 \n" // xmm4 = kBGRAToY
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_bgra += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "paddb %%xmm5,%%xmm0 \n" // add kAddY16 to every output byte
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqa/lea leave EFLAGS alone
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile ( // each pass: read 0x40 bytes of src_bgra, write 0x10 bytes of dst_y, pix -= 16 (movdqu for row data)
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddY16
+ "movdqa %3,%%xmm4 \n" // xmm4 = kBGRAToY
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply bytes by the xmm4 coefficients, summing adjacent pairs
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_bgra += 64
+ "phaddw %%xmm1,%%xmm0 \n" // add adjacent words: one 16-bit sum per 4 input bytes
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n" // logical shift right by 7
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n" // 16 words -> 16 bytes
+ "paddb %%xmm5,%%xmm0 \n" // add kAddY16 to every output byte
+ "sub $0x10,%2 \n" // sets the flags tested by jg; movdqu/lea leave EFLAGS alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n" // dst_y += 16
+ "jg 1b \n" // loop while pix is still > 0
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kBGRAToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kBGRAToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // aligned loads from the row at src_bgra0
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_bgra0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kBGRAToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kBGRAToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile ( // first block only loads the constants
+ "movdqa %0,%%xmm4 \n" // xmm4 = kBGRAToU
+ "movdqa %1,%%xmm3 \n" // xmm3 = kBGRAToV
+ "movdqa %2,%%xmm5 \n" // xmm5 = kAddUV128
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ ); // xmm3/xmm4/xmm5 are read by the next asm block without being reloaded
+ asm volatile ( // each pass: 0x40 bytes from each of two rows in, 8 bytes to dst_u and 8 to dst_v, width -= 16
+ "sub %1,%2 \n" // %2 = dst_v - dst_u, so (%1,%2,1) addresses the V output
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // unaligned loads from the row at src_bgra0
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n" // average with the row at src_bgra0 + stride (loaded via movdqu first)
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n" // src_bgra0 += 64
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n" // even-indexed dwords of xmm0:xmm1
+ "shufps $0xdd,%%xmm1,%%xmm7 \n" // odd-indexed dwords of xmm0:xmm1
+ "pavgb %%xmm7,%%xmm0 \n" // average horizontally adjacent dwords
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n" // multiply by kBGRAToU
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n" // multiply by kBGRAToV
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n" // xmm0 = 8 U sums
+ "phaddw %%xmm6,%%xmm1 \n" // xmm1 = 8 V sums
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
+ "paddb %%xmm5,%%xmm0 \n" // add kAddUV128 to every byte
+ "sub $0x10,%3 \n" // sets the flags tested by jg; the stores/lea below leave EFLAGS alone
+ "movlps %%xmm0," MEMACCESS(1) " \n" // low 8 bytes -> dst_u
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n" // dst_u += 8 (V address follows via the index)
+ "jg 1b \n" // loop while width is still > 0
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (  // movdqa loads/stores; the _Unaligned sibling uses movdqu.
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY
+    LABELALIGN
+    "1:                                        \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 bytes to dst_y
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (  // movdqu for src/dst accesses; constants still use movdqa.
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 bytes to dst_y
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (  // movdqa loads/stores; the _Unaligned sibling uses movdqu.
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY
+    LABELALIGN
+    "1:                                        \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 bytes to dst_y
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (  // movdqu for src/dst accesses; constants still use movdqa.
+    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
+    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddY16
+    "sub       $0x10,%2                        \n"  // pix -= 16
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 bytes to dst_y
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Preload xmm4 = kABGRToU, xmm3 = kABGRToV, xmm5 = kAddUV128.
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kABGRToU),  // %0
+    "m"(kABGRToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (  // NOTE(review): assumes xmm3-xmm5 survive from the asm above.
+    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
+    LABELALIGN
+    "1:                                        \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN  // Average with the row src_stride_abgr bytes further on.
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even dwords
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd dwords
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average neighbouring dwords
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // xmm4 path -> U
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // xmm3 path -> V
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes U, high 8 bytes V
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
+    BUNDLEALIGN  // High 8 bytes go to dst_u + (dst_v - dst_u), i.e. dst_v.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Preload xmm4 = kABGRToU, xmm3 = kABGRToV, xmm5 = kAddUV128.
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kABGRToU),  // %0
+    "m"(kABGRToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (  // NOTE(review): assumes xmm3-xmm5 survive from the asm above.
+    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN  // Average with the row src_stride_abgr bytes further on.
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even dwords
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd dwords
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average neighbouring dwords
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // xmm4 path -> U
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // xmm3 path -> V
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes U, high 8 bytes V
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
+    BUNDLEALIGN  // High 8 bytes go to dst_u + (dst_v - dst_u), i.e. dst_v.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Preload xmm4 = kRGBAToU, xmm3 = kRGBAToV, xmm5 = kAddUV128.
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kRGBAToU),  // %0
+    "m"(kRGBAToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (  // NOTE(review): assumes xmm3-xmm5 survive from the asm above.
+    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
+    LABELALIGN
+    "1:                                        \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN  // Average with the row src_stride_rgba bytes further on.
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even dwords
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd dwords
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average neighbouring dwords
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // xmm4 path -> U
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // xmm3 path -> V
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes U, high 8 bytes V
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
+    BUNDLEALIGN  // High 8 bytes go to dst_u + (dst_v - dst_u), i.e. dst_v.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (  // Preload xmm4 = kRGBAToU, xmm3 = kRGBAToV, xmm5 = kAddUV128.
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kRGBAToU),  // %0
+    "m"(kRGBAToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (  // NOTE(review): assumes xmm3-xmm5 survive from the asm above.
+    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN  // Average with the row src_stride_rgba bytes further on.
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"  // 0x40 source bytes per pass
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even dwords
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd dwords
+    "pavgb     %%xmm7,%%xmm0                   \n"  // average neighbouring dwords
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"  // xmm4 path -> U
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"  // xmm3 path -> V
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes U, high 8 bytes V
+    "paddb     %%xmm5,%%xmm0                   \n"  // add kAddUV128
+    "sub       $0x10,%3                        \n"  // width -= 16
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // low 8 bytes -> dst_u
+    BUNDLEALIGN  // High 8 bytes go to dst_u + (dst_v - dst_u), i.e. dst_v.
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
+#define UG (-25) /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG (-52) /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias: (U coeff + V coeff) * 128; parenthesized so expansion is safe anywhere.
+#define BB (UB * 128 + VB * 128)
+#define BG (UG * 128 + VG * 128)
+#define BR (UR * 128 + VR * 128)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+struct {  // Field order fixes the byte offsets hard-coded in YUVTORGB/YVUTORGB.
+  vec8 kUVToB;  // 0
+  vec8 kUVToG;  // 16
+  vec8 kUVToR;  // 32
+  vec16 kUVBiasB;  // 48
+  vec16 kUVBiasG;  // 64
+  vec16 kUVBiasR;  // 80
+  vec16 kYSub16;  // 96
+  vec16 kYToRgb;  // 112
+  vec8 kVUToB;  // 128
+  vec8 kVUToG;  // 144
+  vec8 kVUToR;  // 160
+} static SIMD_ALIGNED(kYuvConstants) = {
+  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR },
+  { 16, 16, 16, 16, 16, 16, 16, 16 },
+  { YG, YG, YG, YG, YG, YG, YG, YG },
+  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },  // V,U order
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+};
+
+
+// Read 8 U and 8 V bytes (444) and interleave them into 8 UV pairs in xmm0.
+#define READYUV444                                                             \
+    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"
+
+// Read 4 U and 4 V bytes (422), interleave, then double each UV pair to get 8.
+#define READYUV422                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Read 2 UV from 411 (u_buf advances 2 bytes), replicate each pair 4x to get 8.
+#define READYUV411                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "punpckldq  %%xmm0,%%xmm0                                   \n"
+
+// Read 4 interleaved UV pairs (8 bytes) from uv_buf, double each pair to get 8.
+#define READNV12                                                               \
+    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Convert 8 pixels: 8 UV (xmm0) + 8 Y (y_buf) -> xmm0/xmm1/xmm2; callers zero xmm4.
+#define YUVTORGB                                                               \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
+    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
+    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
+    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
+    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
+    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
+    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
+    "paddsw     %%xmm3,%%xmm0                                   \n"            \
+    "paddsw     %%xmm3,%%xmm1                                   \n"            \
+    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+
+// Convert 8 pixels: 8 VU (xmm0) + 8 Y (y_buf); uses the tables at offsets 128-160.
+#define YVUTORGB                                                               \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
+    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
+    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
+    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
+    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
+    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
+    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
+    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
+    "paddsw     %%xmm3,%%xmm0                                   \n"            \
+    "paddsw     %%xmm3,%%xmm1                                   \n"            \
+    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV444
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* dst_rgb24,
+                                 int width) {
+// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs, so i386 loads masks separately.
+#if defined(__i386__)
+  asm volatile (  // NOTE(review): xmm5/xmm6 are assumed to survive to the next asm.
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
+  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
+#endif
+
+  asm volatile (  // 8 pixels per pass; 0x18 output bytes (movq 8 + movdqu 16).
+#if !defined(__i386__)
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
+#endif
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // 4th byte is a copy of xmm2
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"  // apply kShuffleMaskARGBToRGB24_0
+    "pshufb    %%xmm6,%%xmm1                   \n"  // apply kShuffleMaskARGBToRGB24
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+#endif
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_raw,
+                               int width) {
+// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs, so i386 loads masks separately.
+#if defined(__i386__)
+  asm volatile (  // NOTE(review): xmm5/xmm6 are assumed to survive to the next asm.
+    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
+  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
+#endif
+
+  asm volatile (  // 8 pixels per pass; 0x18 output bytes (movq 8 + movdqu 16).
+#if !defined(__i386__)
+    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
+#endif
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"  // 4th byte is a copy of xmm2
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"  // apply kShuffleMaskARGBToRAW_0
+    "pshufb    %%xmm6,%%xmm1                   \n"  // apply kShuffleMaskARGBToRAW
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
+    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+#endif
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV411
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14: READNV12 has no MEMOPREG/MEMOPMEM indexed access.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (  // Same as the NV12 variant but converts with YVUTORGB.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YVUTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14: READNV12 has no MEMOPREG/MEMOPMEM indexed access.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqu.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV444
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqu.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqu.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV411
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* uv_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqu.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14: READNV12 has no MEMOPREG/MEMOPMEM indexed access.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* uv_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (  // Same as the NV12 unaligned variant but converts with YVUTORGB.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YVUTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave xmm0, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm2                   \n"  // interleave xmm2 with 0xff
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14: READNV12 has no MEMOPREG/MEMOPMEM indexed access.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_bgra,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // reset to 0xff; overwritten below
+    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave xmm1, xmm0 bytes
+    "punpcklbw %%xmm2,%%xmm5                   \n"  // interleave 0xff with xmm2
+    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
+    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_abgr,
+                                int width) {
+  asm volatile (  // 8 pixels per pass; 0x20 bytes stored with movdqa.
+    "sub       %[u_buf],%[v_buf]               \n"  // v_buf = v_buf - u_buf
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones (0xff bytes)
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for YUVTORGB
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm2                   \n"  // interleave xmm2, xmm1 bytes
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave xmm0 with 0xff
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm2                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, // Writes 8 four-byte pixels (32 bytes) to dst_rgba per loop pass, aligned stores.
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) { // Consumed 8 at a time; the loop repeats while the remainder is > 0.
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n" // v_buf now holds (v_buf - u_buf). NOTE(review): presumably READYUV422 addresses V relative to U - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones.
+ "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0.
+ LABELALIGN
+ "1: \n"
+ READYUV422 // Macro defined outside this chunk.
+ YUVTORGB // Macro defined outside this chunk. NOTE(review): presumably leaves B/G/R bytes in xmm0/xmm1/xmm2 - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // Re-create 0xff bytes; xmm5 is overwritten as a destination below.
+ "punpcklbw %%xmm2,%%xmm1 \n" // xmm1 = byte pairs (xmm1, xmm2).
+ "punpcklbw %%xmm0,%%xmm5 \n" // xmm5 = byte pairs (0xff, xmm0).
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n" // First 4 pixels; memory order per pixel: 0xff, xmm0, xmm1, xmm2 bytes.
+ "punpckhwd %%xmm1,%%xmm0 \n" // Last 4 pixels, same layout.
+ "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" // movdqa store: dst_rgba must be 16-byte aligned.
+ "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" // Advance dst by 32 bytes without touching flags.
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by the memory-access macros under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, // Writes 8 four-byte pixels (32 bytes) per loop pass using unaligned stores.
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) { // Consumed 8 at a time; the loop repeats while the remainder is > 0.
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n" // v_buf now holds (v_buf - u_buf). NOTE(review): presumably READYUV422 addresses V relative to U - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones.
+ "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0.
+ LABELALIGN
+ "1: \n"
+ READYUV422 // Macro defined outside this chunk.
+ YUVTORGB // Macro defined outside this chunk. NOTE(review): presumably leaves B/G/R bytes in xmm0/xmm1/xmm2 - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // Re-create 0xff bytes; xmm5 is overwritten as a destination below.
+ "punpcklbw %%xmm0,%%xmm1 \n" // xmm1 = byte pairs (xmm1, xmm0).
+ "punpcklbw %%xmm2,%%xmm5 \n" // xmm5 = byte pairs (0xff, xmm2).
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n" // First 4 pixels; memory order per pixel: 0xff, xmm2, xmm1, xmm0 bytes.
+ "punpckhwd %%xmm1,%%xmm0 \n" // Last 4 pixels, same layout.
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" // movdqu store: no alignment requirement on dst_bgra.
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" // Advance dst by 32 bytes without touching flags.
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by the memory-access macros under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, // Writes 8 four-byte pixels (32 bytes) per loop pass using unaligned stores.
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) { // Consumed 8 at a time; the loop repeats while the remainder is > 0.
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n" // v_buf now holds (v_buf - u_buf). NOTE(review): presumably READYUV422 addresses V relative to U - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones (0xff bytes).
+ "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0.
+ LABELALIGN
+ "1: \n"
+ READYUV422 // Macro defined outside this chunk.
+ YUVTORGB // Macro defined outside this chunk. NOTE(review): the weave assumes it leaves xmm5 intact - confirm.
+ "punpcklbw %%xmm1,%%xmm2 \n" // xmm2 = byte pairs (xmm2, xmm1).
+ "punpcklbw %%xmm5,%%xmm0 \n" // xmm0 = byte pairs (xmm0, xmm5).
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n" // First 4 pixels; memory order per pixel: xmm2, xmm1, xmm0, xmm5 bytes.
+ "punpckhwd %%xmm0,%%xmm1 \n" // Last 4 pixels, same layout.
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" // movdqu store: no alignment requirement on dst_abgr.
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+ "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" // Advance dst by 32 bytes without touching flags.
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by the memory-access macros under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, // Writes 8 four-byte pixels (32 bytes) per loop pass using unaligned stores.
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) { // Consumed 8 at a time; the loop repeats while the remainder is > 0.
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n" // v_buf now holds (v_buf - u_buf). NOTE(review): presumably READYUV422 addresses V relative to U - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = all ones.
+ "pxor %%xmm4,%%xmm4 \n" // xmm4 = 0.
+ LABELALIGN
+ "1: \n"
+ READYUV422 // Macro defined outside this chunk.
+ YUVTORGB // Macro defined outside this chunk. NOTE(review): presumably leaves B/G/R bytes in xmm0/xmm1/xmm2 - confirm.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // Re-create 0xff bytes; xmm5 is overwritten as a destination below.
+ "punpcklbw %%xmm2,%%xmm1 \n" // xmm1 = byte pairs (xmm1, xmm2).
+ "punpcklbw %%xmm0,%%xmm5 \n" // xmm5 = byte pairs (0xff, xmm0).
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n" // First 4 pixels; memory order per pixel: 0xff, xmm0, xmm1, xmm2 bytes.
+ "punpckhwd %%xmm1,%%xmm0 \n" // Last 4 pixels, same layout.
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" // movdqu store: no alignment requirement on dst_rgba.
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" // Advance dst by 32 bytes without touching flags.
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by the memory-access macros under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+void YToARGBRow_SSE2(const uint8* y_buf, // Expands 8 luma bytes per loop pass into 8 grey 4-byte pixels with 0xff in byte 3.
+ uint8* dst_argb, // Written with movdqa, so it must be 16-byte aligned.
+ int width) { // Consumed 8 at a time; the loop repeats while the remainder is > 0.
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero-extend bytes to words.
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n" // xmm4 = 0xff000000 in every dword (alpha byte).
+ "mov $0x00100010,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n" // xmm3 = 16 in every word.
+ "mov $0x004a004a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n" // xmm2 = 74 in every word; 74 / 64 approximates the 1.164 gain.
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq " MEMACCESS(0) ",%%xmm0 \n" // Load 8 Y bytes.
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n" // Widen to 8 words.
+ "psubusw %%xmm3,%%xmm0 \n" // y - 16, saturating at 0.
+ "pmullw %%xmm2,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n" // (y - 16) * 74 >> 6.
+ "packuswb %%xmm0,%%xmm0 \n" // Back to bytes, saturating at 255.
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n" // Each grey byte doubled.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n" // First 4 pixels: grey byte repeated 4 times.
+ "punpckhwd %%xmm1,%%xmm1 \n" // Last 4 pixels.
+ "por %%xmm4,%%xmm0 \n" // Force byte 3 of each pixel to 0xff.
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" // xmm5 is written by the pxor above, so it must be declared clobbered too.
+#endif
+ );
+}
+#endif // HAS_YTOARGBROW_SSE2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { // Writes src's bytes to dst in reverse order, 16 bytes per loop pass.
+ intptr_t temp_width = (intptr_t)(width); // Pointer-sized copy so it can serve as an index register.
+ asm volatile (
+ "movdqa %3,%%xmm5 \n" // xmm5 = byte-reversal shuffle control.
+ "lea " MEMLEA(-0x10,0) ",%0 \n" // src -= 16, so (src + width) addresses the last 16 bytes.
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+ "pshufb %%xmm5,%%xmm0 \n" // Reverse the 16 bytes.
+ "sub $0x10,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqa %%xmm0," MEMACCESS(1) " \n" // Aligned store: dst must be 16-byte aligned.
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { // Writes src's bytes to dst in reverse order, 16 bytes per loop pass, without pshufb.
+ intptr_t temp_width = (intptr_t)(width); // Pointer-sized copy so it can serve as an index register.
+ asm volatile (
+ "lea " MEMLEA(-0x10,0) ",%0 \n" // src -= 16, so (src + width) addresses the last 16 bytes.
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n" // Swap the two bytes inside every word.
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n" // Reverse the 4 low words.
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n" // Reverse the 4 high words.
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n" // Swap the 64-bit halves: all 16 bytes are now reversed.
+ "sub $0x10,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqu %%xmm0," MEMACCESS(1) " \n" // Unaligned store.
+ "lea " MEMLEA(0x10,1)",%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = { // Even source bytes, reversed, land in the low 8 bytes; odd source bytes, reversed, in the high 8.
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, // De-interleaves byte pairs from src and writes each plane reversed.
+ int width) { // Counts byte pairs; 8 pairs (16 source bytes) are handled per loop pass.
+ intptr_t temp_width = (intptr_t)(width); // Pointer-sized copy so it can serve as an index register.
+ asm volatile (
+ "movdqa %4,%%xmm1 \n" // xmm1 = shuffle control.
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" // src = src + 2 * width - 16: the last 16 source bytes.
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Aligned load.
+ "lea " MEMLEA(-0x10,0) ",%0 \n" // Walk src backwards.
+ "pshufb %%xmm1,%%xmm0 \n"
+ "sub $8,%3 \n" // Sets the flags tested by jg; the stores and lea below leave them alone.
+ "movlpd %%xmm0," MEMACCESS(1) " \n" // Low 8 bytes -> dst_u.
+ BUNDLEALIGN
+ MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the order of the four 4-byte pixels; bytes within each pixel keep their order.
+static uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { // Writes src's 4-byte pixels to dst in reverse order, 4 pixels per loop pass.
+ intptr_t temp_width = (intptr_t)(width); // Pointer-sized copy so it can serve as an index register.
+ asm volatile (
+ "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" // src = src + 4 * width - 16: the last 4 pixels.
+ "movdqa %3,%%xmm5 \n" // xmm5 = shuffle control.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Aligned load.
+ "pshufb %%xmm5,%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n" // Walk src backwards.
+ "sub $0x4,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqa %%xmm0," MEMACCESS(1) " \n" // Aligned store.
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { // De-interleaves byte pairs: even bytes -> dst_u, odd bytes -> dst_v; 16 pairs per loop pass, aligned accesses.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Load 32 interleaved bytes.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 even bytes.
+ "psrlw $0x8,%%xmm2 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n" // 16 odd bytes.
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, // De-interleaves byte pairs: even bytes -> dst_u, odd bytes -> dst_v; unaligned accesses.
+ int pix) { // Counts byte pairs; 16 pairs are handled per loop pass.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // Load 32 interleaved bytes.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 even bytes.
+ "psrlw $0x8,%%xmm2 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n" // 16 odd bytes.
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, // Interleaves src_u and src_v bytes into dst_uv; aligned accesses.
+ int width) { // Counts byte pairs; 16 pairs (32 output bytes) are produced per loop pass.
+ asm volatile (
+ "sub %0,%1 \n" // src_v becomes its offset from src_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // 16 bytes from src_u.
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n" // Pairs 0..7: (u, v) bytes interleaved.
+ "punpckhbw %%xmm1,%%xmm2 \n" // Pairs 8..15.
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, // Interleaves src_u and src_v bytes into dst_uv; unaligned accesses.
+ uint8* dst_uv, int width) { // width counts byte pairs; 16 pairs (32 output bytes) are produced per loop pass.
+ asm volatile (
+ "sub %0,%1 \n" // src_v becomes its offset from src_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // 16 bytes from src_u.
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n" // Pairs 0..7: (u, v) bytes interleaved.
+ "punpckhbw %%xmm1,%%xmm2 \n" // Pairs 8..15.
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { // Copies count bytes in 32-byte steps; src and dst must be 16-byte aligned (movdqa).
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n" // 32 bytes done; a count that is not a multiple of 32 still gets a full final pass.
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+void CopyRow_X86(const uint8* src, uint8* dst, int width) { // Copies width / 4 dwords with rep movsl; a remainder of 1-3 bytes is not copied.
+ size_t width_tmp = (size_t)(width); // Register-sized count for the "c" (count register) operand.
+ asm volatile (
+ "shr $0x2,%2 \n" // Byte count -> dword count.
+ "rep movsl " MEMMOVESTRING(0,1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_COPYROW_ERMS
+// No alignment requirement; width may be any byte count (multiple of 1).
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { // Copies width bytes with a single rep movsb.
+ size_t width_tmp = (size_t)(width); // Register-sized count for the "c" (count register) operand.
+ asm volatile (
+ "rep movsb " MEMMOVESTRING(0,1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { // Replaces byte 3 of each 4-byte dst pixel with byte 3 of the matching src pixel; 8 pixels per loop pass.
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n" // xmm0 = 0xff000000 per dword: selects byte 3.
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n" // xmm1 = 0x00ffffff per dword: selects bytes 0-2.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n" // 8 src pixels (aligned loads).
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm4 \n" // 8 existing dst pixels.
+ "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n" // dst bytes 0-2 combined with src byte 3.
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { // Replaces byte 3 of each 4-byte dst pixel with byte 3 of the matching src pixel; 16 pixels per loop pass.
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n" // ymm0 = 0x00ffffff per dword: blend mask selecting bytes 0-2.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" // 16 src pixels (unaligned loads).
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" // Masked bytes (0-2) come from dst memory; byte 3 stays from src.
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n" // Clear the upper ymm halves before returning to non-AVX code.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { // Stores one src byte per pixel into byte 3 of each 4-byte dst pixel; 8 pixels per loop pass.
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n" // xmm0 = 0xff000000 per dword: selects byte 3.
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n" // xmm1 = 0x00ffffff per dword: selects bytes 0-2.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm2 \n" // Load 8 source bytes.
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n" // Each byte doubled into a word.
+ "punpckhwd %%xmm2,%%xmm3 \n" // Bytes 4-7 land in the upper word of each xmm3 dword; the stale lower words are masked off below.
+ "punpcklwd %%xmm2,%%xmm2 \n" // Bytes 0-3 replicated across each xmm2 dword.
+ "movdqa " MEMACCESS(1) ",%%xmm4 \n" // 8 existing dst pixels (aligned loads).
+ "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n" // dst bytes 0-2 combined with the new byte 3.
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { // Stores one src byte per pixel into byte 3 of each 4-byte dst pixel; 16 pixels per loop pass.
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n" // ymm0 = 0x00ffffff per dword: blend mask selecting bytes 0-2.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" // 8 source bytes zero-extended to 8 dwords.
+ "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n" // Move each value up into byte 3.
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" // Masked bytes (0-2) come from dst memory; byte 3 stays from src.
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n" // Clear the upper ymm halves before returning to non-AVX code.
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint32 v32, int width) { // Fills width / 4 dwords at dst with v32 using rep stosl; a remainder of 1-3 bytes is not written.
+ size_t width_tmp = (size_t)(width); // Register-sized count for the "c" (count register) operand.
+ asm volatile (
+ "shr $0x2,%1 \n" // Byte count -> dword count.
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+                     int dst_stride, int height) {
+  // Fill each of the `height` rows with `width` copies of the 32-bit value.
+  for (uint8* row = dst; height > 0; --height, row += dst_stride) {
+    uint32* out = (uint32*)(row);
+    size_t remaining = (size_t)(width);
+    asm volatile (
+      "rep stosl " MEMSTORESTRING(eax,0) " \n"
+      : "+D"(out),  // %0: store pointer, advanced by the instruction
+        "+c"(remaining)  // %1: dword count, counted down to zero
+      : "a"(v32)  // %2: value written by every stosl
+      : "memory", "cc");
+  }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { // Extracts the even bytes of src_yuy2 into dst_y; 16 output bytes per loop pass, aligned accesses.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes.
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, // Averages two rows (src and src + stride) and splits their odd bytes into dst_u / dst_v.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; aligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // 32 bytes of this row.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // Rounded byte average of the two rows.
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG/MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, // Splits the odd bytes of one row into dst_u / dst_v; no second-row averaging.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; aligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, // Extracts the even bytes of src_yuy2 into dst_y; unaligned accesses.
+ uint8* dst_y, int pix) { // pix drops by 16 per loop pass (16 output bytes).
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes.
+ "sub $0x10,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, // Averages two rows (src and src + stride) and splits their odd bytes into dst_u / dst_v.
+ int stride_yuy2, // Byte offset from src_yuy2 to the second row.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; unaligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // 32 bytes of this row.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // Rounded byte average of the two rows.
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG/MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, // Splits the odd bytes of one row into dst_u / dst_v; no second-row averaging.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; unaligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { // Extracts the odd bytes of src_uyvy into dst_y; 16 output bytes per loop pass, aligned accesses.
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes.
+ "sub $0x10,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, // Averages two rows (src and src + stride) and splits their even bytes into dst_u / dst_v.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; aligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // 32 bytes of this row.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // Rounded byte average of the two rows.
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG/MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // Splits the even bytes of one row into dst_u / dst_v; no second-row averaging.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; aligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, // Extracts the odd bytes of src_uyvy into dst_y; unaligned accesses.
+ uint8* dst_y, int pix) { // pix drops by 16 per loop pass (16 output bytes).
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // Keep the odd bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes.
+ "sub $0x10,%2 \n" // Sets the flags tested by jg; the store and lea below leave them alone.
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, // Averages two rows (src and src + stride) and splits their even bytes into dst_u / dst_v.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; unaligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // 32 bytes of this row.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // Rounded byte average of the two rows.
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPREG/MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, // Splits the even bytes of one row into dst_u / dst_v; no second-row averaging.
+ uint8* dst_u, uint8* dst_v, int pix) { // pix drops by 16 per loop pass, which writes 8 bytes to each destination; unaligned loads.
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word (low-byte mask).
+ "sub %1,%2 \n" // dst_v becomes its offset from dst_u.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n" // Load 32 source bytes.
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n" // Keep the even bytes.
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 bytes alternating between the two planes.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n" // Even bytes of the packed result -> dst_u.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n" // Odd bytes of the packed result -> dst_v.
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14" // NOTE(review): presumably used by MEMOPMEM under NaCl x86-64 - confirm.
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time in the main loop; single pixels until dst is aligned and for the remainder.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Per pixel: src_argb0 + src_argb1 * (256 - a0) / 256 with saturation, a0 = byte 3 of the src_argb0 pixel.
+ uint8* dst_argb, int width) { // Byte 3 of every output pixel ends up 0xff.
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n" // xmm7 = 1 in every word.
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n" // xmm6 = 0x00ff in every word.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n" // xmm5 = 0xff00 in every word.
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n" // xmm4 = 0xff000000 in every dword (byte 3 mask).
+ "sub $0x1,%3 \n" // From here the counter holds (pixels remaining - 1).
+ "je 91f \n" // Exactly one pixel: go straight to the tail loop.
+ "jl 99f \n" // No pixels: done.
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n" // dst is 16-byte aligned: switch to the 4-pixel loop.
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n" // xmm0 = src_argb0 pixel.
+ "pxor %%xmm4,%%xmm3 \n" // Byte 3 becomes 255 - a0.
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n" // 255 - a0 now sits in every word of the pixel.
+ "pand %%xmm6,%%xmm2 \n" // src_argb1 bytes 0 and 2 as words.
+ "paddw %%xmm7,%%xmm3 \n" // 256 - a0.
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n" // src_argb1 bytes 1 and 3 as words.
+ "por %%xmm4,%%xmm0 \n" // Force byte 3 of the result to 0xff.
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n" // Scaled bytes 0 and 2 back in their byte positions.
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n" // Scaled bytes 1 and 3 in their byte positions.
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n" // Sets the flags tested by jge; the store and lea below leave them alone.
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n" // Counter = pixels remaining - 4.
+ "jl 49f \n" // Fewer than 4 left: skip the 4-pixel loop.
+
+ // 4 pixel loop.
+ LABELALIGN
+ "41: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n" // Sources are loaded unaligned.
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n" // Aligned store: reached with dst aligned by loop 10, or with no pixels left for this loop.
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n" // Counter = pixels remaining - 1.
+ "jl 99f \n" // Nothing left.
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies alpha bytes 3, 7, 11 and 15 into the low byte of each 16 bit word of their pixel; 0x80 entries yield zero.
+static uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend src_argb0 over src_argb1 into dst_argb; the main loops process 4 pixels per iteration.
+// Single pixels are processed until dst_argb is 16 byte aligned and again for the tail.
+
+// Same as SSE2, but replaces
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3,0F5h // 8 alpha words
+// pshuflw xmm3, xmm3,0F5h
+// with..
+// pshufb xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n" // xmm7 = 0x0001 in every 16 bit word.
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n" // xmm6 = 0x00ff in every 16 bit word.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n" // xmm5 = 0xff00 in every 16 bit word.
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n" // xmm4 = 0xff000000 in every 32 bit pixel.
+ "sub $0x1,%3 \n" // width is kept biased by -1 so that jge means "pixels remain".
+ "je 91f \n" // Exactly one pixel: use the 1 pixel loop.
+ "jl 99f \n" // width <= 0: nothing to do.
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n" // Is dst_argb 16 byte aligned yet?
+ "je 19f \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n" // Invert the alpha byte: 255 - alpha0.
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n" // Every word of a pixel = 255 - alpha0.
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n" // 256 - alpha0.
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n" // Force the src_argb0 alpha byte to 0xff.
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n" // Undo the -1 bias and pre-subtract 4 for the 4 pixel loops.
+ "jl 49f \n"
+ "test $0xf,%0 \n" // Either source unaligned: take the movdqu loop at 41.
+ "jne 41f \n"
+ "test $0xf,%1 \n"
+ "jne 41f \n"
+
+ // 4 pixel loop with both sources and the destination 16 byte aligned.
+ LABELALIGN
+ "40: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jge 40b \n"
+ "jmp 49f \n"
+
+ // 4 pixel unaligned loop: movdqu loads, the store to dst_argb is still aligned.
+ LABELALIGN
+ "41: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n" // Back to the -1 biased count for the remaining 0..3 pixels.
+ "jl 99f \n"
+
+ // 1 pixel loop for the remaining pixels.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time: scale the three low bytes of each pixel by that pixel's alpha and keep the alpha byte unchanged.
+// src_argb and dst_argb are accessed with movdqa, so both must be aligned to 16 bytes.
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n" // xmm4 = 0xff000000 per pixel (alpha mask).
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n" // xmm5 = 0x00ffffff per pixel (color mask).
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // Low 2 pixels, each byte duplicated into a word.
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n" // Broadcast each pixel's alpha word
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n" // across the 4 words of that pixel.
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n" // High 2 pixels, same treatment.
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n" // Original alpha bytes.
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n" // Drop the computed alpha byte...
+ "por %%xmm2,%%xmm0 \n" // ...and put the original alpha back.
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha: kShuffleAlpha0 covers pixels 0-1 (alpha bytes 3, 7), kShuffleAlpha1 pixels 2-3 (alpha bytes 11, 15); 128u entries zero the alpha word.
+static uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+// Attenuate 4 pixels at a time: scale the three low bytes of each pixel by that pixel's alpha and keep the alpha byte unchanged.
+// src_argb and dst_argb are accessed with movdqu, so they need not be 16 byte aligned; the shuffle tables are loaded with movdqa.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n" // xmm3 = 0xff000000 per pixel (alpha mask).
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n" // Alpha words for pixels 0-1 (alpha word itself zeroed).
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n" // Pixels 0-1, each byte duplicated into a word.
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n" // Alpha words for pixels 2-3.
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n" // Pixels 2-3, each byte duplicated into a word.
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pand %%xmm3,%%xmm2 \n" // Original alpha bytes.
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n" // Put the original alpha back.
+ "sub $0x4,%2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time: each pixel is multiplied by the fixed_invtbl8 entry selected by its alpha byte.
+// src_argb and dst_argb are accessed with movdqu, so they need not be 16 byte aligned.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0; // Scratch register for the zero extended alpha byte (%3).
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n" // Alpha of pixel 0.
+ "punpcklbw %%xmm0,%%xmm0 \n" // Pixels 0-1, each byte duplicated into a word.
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n" // Alpha of pixel 1.
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n" // Words 0-2 = low word of the entry, word 3 = its high word.
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n" // Pixel 1 multipliers into the high half.
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n" // Alpha of pixel 2.
+ "punpckhbw %%xmm1,%%xmm1 \n" // Pixels 2-3, each byte duplicated into a word.
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n" // Alpha of pixel 3.
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels per iteration; src_argb and dst_argb must be 16 byte aligned (movdqa).
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n" // xmm4 = kARGBToYJ coefficients.
+ "movdqa %4,%%xmm5 \n" // xmm5 = kAddYJ64, added before the >> 7.
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n" // One weighted sum per pixel, 8 words.
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // 8 gray bytes.
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n" // Isolate the alpha byte of each pixel.
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n" // 8 alpha bytes.
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // gray, gray pairs.
+ "punpcklbw %%xmm2,%%xmm3 \n" // gray, alpha pairs.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n" // gray, gray, gray, alpha for pixels 0-3.
+ "punpckhwd %%xmm3,%%xmm1 \n" // Same for pixels 4-7.
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone, stored in byte order b, g, r, a and repeated for 4 pixels.
+static vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place; dst_argb must be 16 byte aligned (movdqa).
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n" // xmm2 = kARGBToSepiaB.
+ "movdqa %3,%%xmm3 \n" // xmm3 = kARGBToSepiaG.
+ "movdqa %4,%%xmm4 \n" // xmm4 = kARGBToSepiaR.
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // 8 new b bytes.
+ "movdqa " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n" // 8 new g bytes.
+ "punpcklbw %%xmm5,%%xmm0 \n" // b, g pairs.
+ "movdqa " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n" // 8 new r bytes.
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n" // Isolate the original alpha bytes.
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n" // r, a pairs.
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n" // b, g, r, a for pixels 0-3.
+ "punpckhwd %%xmm5,%%xmm1 \n" // Same for pixels 4-7.
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix; src_argb and dst_argb must be 16 byte aligned (movdqa).
+// Same as Sepia except matrix is provided: 16 signed bytes at matrix_argb, 4 per output byte, with sums shifted right by 6.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n" // Load the 16 matrix bytes (unaligned load).
+ "pshufd $0x00,%%xmm5,%%xmm2 \n" // xmm2 = matrix bytes 0-3 repeated (output byte 0).
+ "pshufd $0x55,%%xmm5,%%xmm3 \n" // xmm3 = matrix bytes 4-7 repeated (output byte 1).
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n" // xmm4 = matrix bytes 8-11 repeated (output byte 2).
+ "pshufd $0xff,%%xmm5,%%xmm5 \n" // xmm5 = matrix bytes 12-15 repeated (output byte 3).
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n" // Output bytes 0 and 1 interleaved.
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n" // Output bytes 2 and 3 interleaved.
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n" // Complete pixels 0-3.
+ "punpckhwd %%xmm1,%%xmm6 \n" // Complete pixels 4-7.
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) in place: each byte becomes ((v * scale) >> 16) * interval_size + interval_offset, then the original alpha byte is ORed back in.
+// dst_argb is accessed with movdqa, so it must be aligned to 16 bytes.
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n" // Words 0-2 = low 16 bits of scale, word 3 = its high 16 bits.
+ "pshufd $0x44,%%xmm2,%%xmm2 \n" // Repeat that 4 word pattern for the second pixel.
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n" // Same layout for interval_size.
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n" // Same layout for interval_offset.
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero extend bytes to words.
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n" // xmm6 = 0xff000000 per pixel (alpha mask).
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n" // (v * scale) >> 16 for pixels 0-1.
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n" // Same for pixels 2-3.
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n" // Original alpha bytes.
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "sub $0x4,%1 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value: every byte of a pixel is scaled by the corresponding byte of value.
+// src_argb and dst_argb are accessed with movdqa, so both must be aligned to 16 bytes.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n" // Duplicate each byte of value into a 16 bit word.
+ "punpcklqdq %%xmm2,%%xmm2 \n" // Repeat those 4 words for the second pixel.
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // Pixels 0-1, each byte duplicated into a word.
+ "punpckhbw %%xmm1,%%xmm1 \n" // Pixels 2-3, each byte duplicated into a word.
+ "pmulhuw %%xmm2,%%xmm0 \n" // High 16 bits of the word products.
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time; all loads and stores are unaligned (movdqu).
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero extend src_argb1 bytes.
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n" // src_argb0 bytes duplicated into words (v * 257).
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n" // src_argb1 bytes zero extended to words.
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n" // High 16 bits of each product.
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time, with unsigned byte saturation; loads and stores are unaligned (movdqu).
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n" // src_argb0 + src_argb1, saturating per byte.
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time, with unsigned byte saturation; loads and stores are unaligned (movdqu).
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n" // src_argb0 - src_argb1, saturating at 0 per byte.
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+// Writes 8 bytes per iteration: the absolute value of the filter over rows src_y0, src_y1, src_y2, saturated to 8 bits.
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n" // Turn %1, %2 and %3 into offsets from src_y0 so
+ "sub %0,%2 \n" // only %0 has to be advanced inside the loop.
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero extend bytes to words.
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n" // Row 0: y0[x] - y0[x + 2].
+ BUNDLEALIGN
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n" // Row 1: y1[x] - y1[x + 2].
+ BUNDLEALIGN
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n" // Row 2: y2[x] - y2[x + 2].
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n" // The middle row is added twice (weight 2).
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n" // Absolute value: max(sum, -sum).
+ "packuswb %%xmm0,%%xmm0 \n"
+ "sub $0x8,%4 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+// Writes 8 bytes per iteration: the absolute value of the filter over rows src_y0 and src_y1, saturated to 8 bits.
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n" // Turn %1 and %2 into offsets from src_y0 so
+ "sub %0,%2 \n" // only %0 has to be advanced inside the loop.
+ "pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero extend bytes to words.
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n" // Column 0: y0[x] - y1[x].
+ BUNDLEALIGN
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n" // Column 1: y0[x + 1] - y1[x + 1].
+ BUNDLEALIGN
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n" // Column 2: y0[x + 2] - y1[x + 2].
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n" // The middle column is added twice (weight 2).
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n" // Absolute value: max(sum, -sum).
+ "packuswb %%xmm0,%%xmm0 \n"
+ "sub $0x8,%3 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels per iteration; all three pointers are accessed with movdqa.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n" // %1 becomes the offset of src_sobely from src_sobelx.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n" // xmm5 = 0xff000000 per pixel (alpha = 255).
+
+ // 16 pixel loop (16 input bytes per source, 64 output bytes).
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n" // Sobel = sobelx + sobely, saturating.
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n" // Each Sobel byte duplicated to 2 bytes...
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n" // ...and then to 4 bytes per pixel.
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n" // Set the alpha byte to 255.
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm1," MEMACCESS(2) " \n"
+ "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into a plane, 16 pixels per iteration; all three pointers are accessed with movdqa.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n" // %1 becomes the offset of src_sobely from src_sobelx.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 is written here but never read by the loop below;
+ "pslld $0x18,%%xmm5 \n" // it is still modified, so it is listed as a clobber.
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n" // Sobel = sobelx + sobely, saturating.
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB, 16 pixels per iteration; all three pointers are accessed with movdqa.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n" // %1 becomes the offset of src_sobely from src_sobelx.
+ "pcmpeqb %%xmm5,%%xmm5 \n" // xmm5 = 0xff in every byte (alpha = 255).
+
+ // 16 pixel loop (16 input bytes per source, 64 output bytes).
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n" // Sobel = sobelx + sobely, saturating.
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n" // Sobel X, 255 pairs (low 8 pixels).
+ "punpckhbw %%xmm5,%%xmm0 \n" // Sobel X, 255 pairs (high 8 pixels).
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n" // Sobel Y, Sobel pairs (low 8 pixels).
+ "punpckhbw %%xmm2,%%xmm1 \n" // Sobel Y, Sobel pairs (high 8 pixels).
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n" // Sobel Y, Sobel, Sobel X, 255 per pixel.
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm6," MEMACCESS(2) " \n"
+ "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value. Each pixel is 4 bytes in row and 4 int32 values in cumsum and previous_cumsum.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n" // xmm0 = running sum of this row (4 x int32).
+ "pxor %%xmm1,%%xmm1 \n" // xmm1 = 0, used to zero extend bytes and words.
+ "sub $0x4,%3 \n"
+ "jl 49f \n" // Fewer than 4 pixels: 1 pixel loop only.
+ "test $0xf,%1 \n" // cumsum not 16 byte aligned: 1 pixel loop only.
+ "jne 49f \n"
+
+ // 4 pixel loop. NOTE(review): only cumsum (%1) is alignment tested above, yet previous_cumsum (%2) is read with movdqa here; confirm callers keep it 16 byte aligned.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n" // Pixel 0 as 4 x int32.
+ "punpckhwd %%xmm1,%%xmm3 \n" // Pixel 1 as 4 x int32.
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n" // Pixel 2 as 4 x int32.
+ "punpckhwd %%xmm1,%%xmm5 \n" // Pixel 3 as 4 x int32.
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqa " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n" // previous_cumsum + running sum.
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n" // Remaining pixel count minus 1.
+ "jl 19f \n"
+
+ // 1 pixel loop: unaligned (movdqu) accesses to previous_cumsum and cumsum.
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // Rows of cumulative sums, 4 int32 per pixel.
+ int width, int area, uint8* dst, // width: offset in int32 elements added to both row pointers; area: divisor.
+ int count) { // count: number of 4 byte pixels written to dst.
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n" // area as a float.
+ "rcpss %%xmm5,%%xmm4 \n" // Approximate 1 / area...
+ "pshufd $0x0,%%xmm4,%%xmm4 \n" // ...broadcast to all 4 lanes of xmm4.
+ "sub $0x4,%3 \n"
+ "jl 49f \n" // Fewer than 4 pixels: 1 pixel loop only.
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n" // area above 128 (unsigned compare): use the float loop at 40.
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n" // 65535.0 in every lane.
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n" // (area + 65535.0) * (1 / area).
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n" // 16 bit multiplier used with pmulhuw below.
+
+ // 4 pixel small loop: sums are packed to 16 bits and scaled with pmulhuw.
+ LABELALIGN
+ "4: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+ // 4 pixel loop: sums are converted to float and multiplied by the reciprocal in xmm4.
+ LABELALIGN
+ "40: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n" // Remaining pixel count minus 1.
+ "jl 19f \n"
+
+ // 1 pixel loop: sum = topleft[0] - topleft[width] - botleft[0] + botleft[width], then multiplied by 1 / area.
+ LABELALIGN
+ "10: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy 'width' ARGB pixels to dst_argb, sampling src_argb along a line: src_dudv[0..1] is the start (x, y), src_dudv[2..3] the per-pixel step (dx, dy).
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;  // register-sized copy; turned into the pmaddwd multiplier, then reused for pixel offsets
+ intptr_t temp = 0;  // scratch register (%5) holding a second pixel offset
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"  // xmm2 low = start (x, y)
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"  // xmm7 low = step (dx, dy)
+ "shl $0x10,%1 \n"  // %1 = (stride << 16) + 4, i.e. the 16-bit pair (4, stride)
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"  // multiplier for pmaddwd: x * 4 + y * stride
+ "sub $0x4,%4 \n"
+ "jl 49f \n"  // fewer than 4 pixels: go straight to the 1 pixel loop
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"  // xmm7 = (dx, dy, dx, dy)
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // broadcast the multiplier to all 4 dwords
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"  // coordinates of pixel 1
+ "movlhps %%xmm0,%%xmm2 \n"  // xmm2 = (x0, y0, x1, y1)
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"  // xmm4 = 2 * step
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"  // xmm3 = (x2, y2, x3, y3)
+ "addps %%xmm4,%%xmm4 \n"  // xmm4 = 4 * step, the advance per loop iteration
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts (saturated to signed 16 bit)
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"  // byte offset of pixel 0
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"  // rotate so the next offset is in the low dword
+ "movd %%xmm0,%k5 \n"  // byte offset of pixel 1
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"  // pixels 0 and 1 side by side
+ "addps %%xmm4,%%xmm2 \n"  // advance first coordinate pair by 4 steps
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"  // byte offset of pixel 2
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"  // byte offset of pixel 3
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"  // pixels 2 and 3 side by side
+ "addps %%xmm4,%%xmm3 \n"  // advance second coordinate pair by 4 steps
+ "sub $0x4,%4 \n"  // sets the flags consumed by jge below (movq/lea leave flags alone)
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"  // %4 = remaining pixels - 1
+ "jl 19f \n"  // nothing left
+
+ // 1 pixel loop. Only the low (x, y) of xmm2/xmm7 and the low dword of xmm5 are used.
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"  // off = x * 4 + y * stride
+ "addps %%xmm7,%%xmm2 \n"  // step to the next pixel
+ "movd %%xmm0,%k1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "sub $0x1,%4 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1: blends the row at src_ptr with the row at src_ptr + src_stride, weighted by source_y_fraction, into dst_ptr (aligned movdqa loads and stores).
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0,1) addresses the destination
+ "shr %3 \n"  // f = source_y_fraction / 2
+ "cmp $0x0,%3 \n"  // dispatch the fractions that have a dedicated pavgb/copy path
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"  // xmm0 = f
+ "neg %3 \n"
+ "add $0x80,%3 \n"  // %3 = 128 - f
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"  // byte pair (128 - f, f)
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // pair broadcast to every word of xmm5
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"  // 16 bytes of the first row
+ MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"  // interleave first-row and second-row bytes
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"  // row0 * (128 - f) + row1 * f per word
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"  // divide by 128
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the second row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"  // note the swapped registers: xmm1 = first row
+ MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the first row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2 (consumed 16 bytes per iteration)
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1: SSE2 blend of the row at src_ptr with the row at src_ptr + src_stride, weighted by source_y_fraction, into dst_ptr (aligned movdqa loads and stores).
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0,1) addresses the destination
+ "shr %3 \n"  // f = source_y_fraction / 2
+ "cmp $0x0,%3 \n"  // dispatch the fractions that have a dedicated pavgb/copy path
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"  // xmm0 = f
+ "neg %3 \n"
+ "add $0x80,%3 \n"  // %3 = 128 - f
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"  // byte pair (128 - f, f)
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // pair broadcast to every word of xmm5
+ "pxor %%xmm4,%%xmm4 \n"  // zero, used to widen bytes to words
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"  // 16 bytes of the first row
+ MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"  // widen both rows to 16-bit lanes
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"  // second row - first row
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"  // difference doubled
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"  // high 16 bits of difference * xmm5 word
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"  // first row + scaled difference
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the second row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"  // note the swapped registers: xmm1 = first row
+ MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the first row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2 (consumed 16 bytes per iteration)
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1: same blend as InterpolateRow_SSSE3 but every load and store uses movdqu, so no 16-byte alignment is required.
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0,1) addresses the destination
+ "shr %3 \n"  // f = source_y_fraction / 2
+ "cmp $0x0,%3 \n"  // dispatch the fractions that have a dedicated pavgb/copy path
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"  // xmm0 = f
+ "neg %3 \n"
+ "add $0x80,%3 \n"  // %3 = 128 - f
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"  // byte pair (128 - f, f)
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // pair broadcast to every word of xmm5
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"  // 16 bytes of the first row
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
+ "movdqu %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"  // interleave first-row and second-row bytes
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"  // row0 * (128 - f) + row1 * f per word
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"  // divide by 128
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the second row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"  // note the swapped registers: xmm1 = first row
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the first row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2 (consumed 16 bytes per iteration)
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1: same blend as InterpolateRow_SSE2 but every load and store uses movdqu, so no 16-byte alignment is required.
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"  // %0 = dst_ptr - src_ptr, so (%1,%0,1) addresses the destination
+ "shr %3 \n"  // f = source_y_fraction / 2
+ "cmp $0x0,%3 \n"  // dispatch the fractions that have a dedicated pavgb/copy path
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"  // xmm0 = f
+ "neg %3 \n"
+ "add $0x80,%3 \n"  // %3 = 128 - f
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"  // byte pair (128 - f, f)
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // pair broadcast to every word of xmm5
+ "pxor %%xmm4,%%xmm4 \n"  // zero, used to widen bytes to words
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"  // 16 bytes of the first row
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"  // widen both rows to 16-bit lanes
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"  // second row - first row
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"  // difference doubled
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"  // high 16 bits of difference * xmm5 word
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"  // first row + scaled difference
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the second row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"  // note the swapped registers: xmm1 = first row
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"  // average of both rows
+ "pavgb %%xmm1,%%xmm0 \n"  // averaged again with the first row
+ "sub $0x10,%2 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2 (consumed 16 bytes per iteration)
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_HALFROW_SSE2  // HalfRow: byte-wise rounded average (pavgb) of the row at src_uv and the row at src_uv + src_uv_stride.
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {  // pix is consumed 16 bytes per iteration; loads and stores are aligned (movdqa)
+ asm volatile (
+ "sub %0,%1 \n"  // %1 = dst_uv - src_uv, so (%0,%1) addresses the destination
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(pix) // %2
+ : "r"((intptr_t)(src_uv_stride)) // %3
+ : "memory", "cc"  // NOTE(review): other MEMOPREG/MEMOPMEM users in this file also clobber "r14" for native client x86_64; confirm whether it is needed here.
+#if defined(__SSE2__)
+ , "xmm0"
+#endif
+ );
+}
+#endif // HAS_HALFROW_SSE2
+
+#ifdef HAS_ARGBTOBAYERROW_SSSE3  // ARGBToBayerRow: picks one byte per pixel using the 4 pshufb indices packed in 'selector'.
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {  // each iteration reads 32 source bytes (aligned) and writes 8 output bytes
+ asm volatile (
+ // NaCL caveat - assumes movd is from GPR
+ "movd %3,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"  // selector broadcast to all 4 dwords
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"  // selected bytes land in each dword; only the low dword is kept
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"  // low dwords of xmm0 and xmm1 -> 8 output bytes
+ "sub $0x8,%2 \n"  // sets the flags for jg below; movq and lea leave flags alone
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ : "g"(selector) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBTOBAYERROW_SSSE3
+
+#ifdef HAS_ARGBTOBAYERGGROW_SSE2  // ARGBToBayerGGRow: extracts byte 1 of every 4-byte pixel (presumably G - confirm against the ARGB byte order).
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {  // 'selector' is not referenced by the asm below; 8 pixels per iteration
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"  // xmm5 = 0x000000ff in every dword
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x8,%%xmm0 \n"  // move byte 1 of each pixel to byte 0
+ "psrld $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"  // keep only that byte
+ "pand %%xmm5,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"  // 8 dwords -> 8 words
+ "packuswb %%xmm1,%%xmm0 \n"  // low 8 bytes = the 8 results; the high half is not stored
+ "sub $0x8,%2 \n"  // sets the flags for jg below; movq and lea leave flags alone
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBTOBAYERGGROW_SSE2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: reorders bytes with pshufb using the 16-byte table at 'shuffler'; 8 pixels per iteration, aligned loads and stores.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "movdqa " MEMACCESS(3) ",%%xmm5 \n"  // xmm5 = shuffle table (aligned load)
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "sub $0x8,%2 \n"  // sets the flags for jg below; the stores and lea leave flags alone
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {  // pshufb byte reorder, 8 pixels per iteration, movdqu for pixel data
+ asm volatile (
+ "movdqa " MEMACCESS(3) ",%%xmm5 \n"  // the shuffle table itself is still loaded with an aligned movdqa
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "sub $0x8,%2 \n"  // sets the flags for jg below; the stores and lea leave flags alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: AVX2 byte reorder with vpshufb; 16 pixels (64 bytes) per iteration, unaligned loads and stores.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"  // 16-byte shuffle table copied to both 128-bit lanes
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "sub $0x10,%2 \n"  // sets the flags for jg below; the stores and lea leave flags alone
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "jg 1b \n"  // NOTE(review): no vzeroupper before leaving, unlike ARGBPolynomialRow_AVX2 in this file - confirm whether intended.
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA: SSE2 byte reorder. Four known shuffler patterns use a 4 pixel pshufhw/pshuflw path; anything else uses a 1 pixel byte loop.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ uintptr_t pixel_temp = 0u;  // scratch in the d register (see the "+d" constraint); its low byte is accessed as %b2
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"  // zero, used to widen bytes to words
+ "mov " MEMACCESS(4) ",%k2 \n"  // first 4 shuffler bytes as one 32-bit value
+ "cmp $0x3000102,%k2 \n"  // shuffler bytes 2,1,0,3
+ "je 3012f \n"
+ "cmp $0x10203,%k2 \n"  // shuffler bytes 3,2,1,0
+ "je 123f \n"
+ "cmp $0x30201,%k2 \n"  // shuffler bytes 1,2,3,0
+ "je 321f \n"
+ "cmp $0x2010003,%k2 \n"  // shuffler bytes 3,0,1,2
+ "je 2103f \n"
+
+ LABELALIGN
+ "1: \n"  // generic path: one pixel per iteration, dst[i] = src[shuffler[i]] for i = 0..3
+ "movzb " MEMACCESS(4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS(1) " \n"
+ "movzb " MEMACCESS2(0x1,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x1,1) " \n"
+ BUNDLEALIGN
+ "movzb " MEMACCESS2(0x2,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x2,1) " \n"
+ "movzb " MEMACCESS2(0x3,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x3,1) " \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "123: \n"  // 4 pixels per iteration; word shuffle 0x1b reverses the 4 bytes of each pixel
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"  // widen bytes to words so pshufhw/pshuflw can reorder them
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"  // back to bytes
+ "sub $0x4,%3 \n"  // sets the flags for jg below; the store and lea leave flags alone
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 123b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "321: \n"  // same structure as 123 with word shuffle 0x39
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x39,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x39,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x39,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x39,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 321b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "2103: \n"  // same structure as 123 with word shuffle 0x93
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x93,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x93,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x93,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x93,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 2103b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "3012: \n"  // same structure as 123 with word shuffle 0xc6
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
+ "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 3012b \n"
+
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+d"(pixel_temp), // %2
+ "+r"(pix) // %3
+ : "r"(shuffler) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2  // I422ToYUY2Row: interleaves planar Y, U, V into packed Y,U,Y,V bytes; 16 Y + 8 U + 8 V in, 32 bytes out per iteration.
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"  // 8 U bytes
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"  // xmm2 = U0 V0 U1 V1 ...
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"  // 16 Y bytes
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"  // Y0 U0 Y1 V0 ... (first 8 Y)
+ "punpckhbw %%xmm2,%%xmm1 \n"  // same interleave for the last 8 Y
+ "movdqu %%xmm0," MEMACCESS(3) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2  // I422ToUYVYRow: interleaves planar Y, U, V into packed U,Y,V,Y bytes; 16 Y + 8 U + 8 V in, 32 bytes out per iteration.
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = src_v - src_u, so (%1,%2,1) addresses V
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"  // 8 U bytes
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"  // xmm2 = U0 V0 U1 V1 ...
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"  // 16 Y bytes
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"  // U0 Y0 V0 Y1 ... (first 8 Y)
+ "punpckhbw %%xmm0,%%xmm2 \n"  // same interleave for the last 8 Y
+ "movdqu %%xmm1," MEMACCESS(3) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2  // ARGBPolynomialRow: per channel result = C0 + C1*X + C2*X^2 + C3*X^3, with C0..C3 the four 4-float vectors at poly + 0x00/0x10/0x20/0x30.
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {  // poly is used as a direct mulps/addps memory operand below
+ asm volatile (
+ "pxor %%xmm3,%%xmm3 \n"  // zero, used to widen bytes to dwords
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"  // 2 pixels = 8 bytes
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"  // pixel 0 as 4 dwords
+ "punpckhwd %%xmm3,%%xmm4 \n"  // pixel 1 as 4 dwords
+ "cvtdq2ps %%xmm0,%%xmm0 \n"  // X for pixel 0
+ "cvtdq2ps %%xmm4,%%xmm4 \n"  // X for pixel 1
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"  // C1 * X
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"  // C0 + C1 * X
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"  // X * X
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"  // X * X * X
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"  // C2 * X * X
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"  // C3 * X * X * X
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"  // truncate to 32-bit integers
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"  // packs the 16-bit halves of each dword to bytes (unsigned saturation)
+ "packuswb %%xmm0,%%xmm0 \n"  // folds the byte pairs to one byte per channel. NOTE(review): exact for results in 0..255; confirm out-of-range behavior.
+ "sub $0x2,%2 \n"  // sets the flags for jg below; movq and lea leave flags alone
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2  // ARGBPolynomialRow: per channel result = C0 + C1*X + C2*X^2 + C3*X^3, using fused multiply-add (vfmadd*) instructions.
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"  // C0 in both lanes
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"  // truncate to 32-bit integers
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"  // dwords -> words within each lane
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // gather both lanes' words into the low 128 bits
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"  // words -> bytes
+ "sub $0x2,%2 \n"  // sets the flags for jg below; vmovq and lea leave flags alone
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ "vzeroupper \n"  // clear the upper ymm halves before returning to non-VEX code
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+// TODO(fbarchard): declare ymm usage when applicable.
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table, in place: each of the 4 bytes of a pixel is replaced by table_argb[value * 4 + byte_index].
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp = 0u;  // scratch in the d register (see the "+d" constraint); its low byte is accessed as %b1
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"  // byte 0 of the pixel
+ "lea " MEMLEA(0x4,0) ",%0 \n"  // advance first; the remaining accesses use negative offsets
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"  // byte 1
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"  // byte 2
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"  // byte 3
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");  // NOTE(review): other MEMOP* users in this file also clobber "r14" for native client x86_64; confirm whether it is needed here.
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table, in place: bytes 0..2 of each 4-byte pixel are replaced by table_argb[value * 4 + byte_index]; byte 3 is left untouched.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp = 0u;  // scratch in the d register (see the "+d" constraint); its low byte is accessed as %b1
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"  // byte 0 of the pixel
+ "lea " MEMLEA(0x4,0) ",%0 \n"  // advance first; the remaining accesses use negative offsets
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"  // byte 1
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"  // byte 2
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");  // NOTE(review): other MEMOP* users in this file also clobber "r14" for native client x86_64; confirm whether it is needed here.
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table: a per-pixel weighted sum (lumacoeff) selects a 256-byte-aligned offset into 'luma'; bytes 0..2 are looked up there, byte 3 is copied.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp = 0u;  // scratch in the d register; its low byte is accessed as %b0
+ uintptr_t table_temp = 0u;  // per-pixel table pointer, kept in the a register
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"  // lumacoeff broadcast to all 4 dwords
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"  // xmm4 = 0xff00 in every word
+ "pxor %%xmm5,%%xmm5 \n"  // zero, used to widen words to dwords
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(2) ",%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"  // weighted byte pairs
+ "phaddw %%xmm0,%%xmm0 \n"  // one weighted sum per pixel
+ "pand %%xmm4,%%xmm0 \n"  // keep the high byte: offset becomes a multiple of 256
+ "punpcklwd %%xmm5,%%xmm0 \n"  // 4 offsets as dwords
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // %1 = luma + offset for pixel 0
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"  // rotate so the next offset is in the low dword
+
+ "movzb " MEMACCESS(2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS(3) " \n"
+ "movzb " MEMACCESS2(0x1,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x1,3) " \n"
+ "movzb " MEMACCESS2(0x2,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x2,3) " \n"
+ "movzb " MEMACCESS2(0x3,2) ",%0 \n"  // byte 3 is copied without a table lookup
+ "mov %b0," MEMACCESS2(0x3,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // %1 = luma + offset for pixel 1
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x4,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x4,3) " \n"
+ BUNDLEALIGN
+ "movzb " MEMACCESS2(0x5,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x5,3) " \n"
+ "movzb " MEMACCESS2(0x6,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x6,3) " \n"
+ "movzb " MEMACCESS2(0x7,2) ",%0 \n"  // byte 3 is copied without a table lookup
+ "mov %b0," MEMACCESS2(0x7,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // %1 = luma + offset for pixel 2
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x8,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x8,3) " \n"
+ "movzb " MEMACCESS2(0x9,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x9,3) " \n"
+ "movzb " MEMACCESS2(0xa,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xa,3) " \n"
+ "movzb " MEMACCESS2(0xb,2) ",%0 \n"  // byte 3 is copied without a table lookup
+ "mov %b0," MEMACCESS2(0xb,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"  // %1 = luma + offset for pixel 3
+
+ "movzb " MEMACCESS2(0xc,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xc,3) " \n"
+ "movzb " MEMACCESS2(0xd,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xd,3) " \n"
+ "movzb " MEMACCESS2(0xe,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xe,3) " \n"
+ "movzb " MEMACCESS2(0xf,2) ",%0 \n"  // byte 3 is copied without a table lookup
+ "mov %b0," MEMACCESS2(0xf,3) " \n"
+ "sub $0x4,%4 \n"  // sets the flags for jg below; the two lea instructions leave flags alone
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "lea " MEMLEA(0x10,3) ",%3 \n"
+ "jg 1b \n"
+ : "+d"(pixel_temp), // %0
+ "+a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc"  // NOTE(review): other MEMOP* users in this file also clobber "r14" for native client x86_64; confirm whether it is needed here.
+#if defined(__SSE2__)
+ , "xmm0", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
new file mode 100644
index 0000000..2cfacad
--- /dev/null
+++ b/third_party/libyuv/source/row_win.cc
@@ -0,0 +1,7284 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB. Signed byte weights fed to pmaddubsw over B, G, R, A bytes.
+static const vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPeg full range. The YJ rows add kAddYJ64 for rounding and no 16 offset.
+static const vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = { // U weights; UV rows shift the sum right by 8.
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = { // U weights used by the UVJ rows.
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = { // V weights; UV rows shift the sum right by 8.
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = { // V weights used by the UVJ rows.
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpermd dword indices that undo the per-lane reordering of vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// vpshufb word shuffle (same in both lanes) undoing vshufps + vphaddw ordering.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Constants for BGRA. Same coefficients as the ARGB tables in reversed byte order.
+static const vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = { // kARGBToU coefficients, byte order reversed.
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = { // kARGBToV coefficients, byte order reversed.
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR. ARGB coefficients with the first and third weights swapped.
+static const vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = { // kARGBToU with first and third weights swapped.
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = { // kARGBToV with first and third weights swapped.
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA. ARGB coefficients with the zero weight moved to the front.
+static const vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = { // kARGBToU with the zero weight moved to the front.
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = { // kARGBToV with the zero weight moved to the front.
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = { // Byte offset of 16 added (paddb) after packing Y.
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static const vec16 kAddYJ64 = { // Word rounding term (0.5 before the >> 7) for YJ.
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = { // Byte bias of 128 added (paddb) after packing U/V.
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = { // Word term added (paddw) before the >> 8 in UVJ.
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB. Alpha slots are filled by a later por.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB. Also reverses each 3-byte triple.
+static const uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24. Index 128 makes pshufb write zero.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW. Index 128 makes pshufb write zero.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// RAW counterpart of kShuffleMaskARGBToRGB24_0 (presumably for I422ToRAW). First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
+// Gray to ARGB: writes each Y byte 3 times (B, G, R) and sets alpha to 0xff.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ // 8 pixels per pass. movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax] // load 8 Y bytes
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y -> YY
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // pixels 0-3: YY -> YYYY
+ punpckhwd xmm1, xmm1 // pixels 4-7: YY -> YYYY
+ por xmm0, xmm5 // force top byte (alpha) to 0xff
+ por xmm1, xmm5
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop // repeat while pixels remain; a partial group of 8 is still run
+ ret
+ }
+}
+// Same as I400ToARGBRow_SSE2 but stores with movdqu, so dst_argb may be unaligned.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ // 8 pixels per pass: 8 bytes in, 32 bytes out.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax] // load 8 Y bytes
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y -> YY
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // pixels 0-3: YY -> YYYY
+ punpckhwd xmm1, xmm1 // pixels 4-7: YY -> YYYY
+ por xmm0, xmm5 // force top byte (alpha) to 0xff
+ por xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop // repeat while pixels remain; a partial group of 8 is still run
+ ret
+ }
+}
+// Expands 16 RGB24 pixels (48 bytes) per pass to 16 ARGB pixels with alpha 0xff.
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
+ // Loads are unaligned; the movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+// Like RGB24ToARGBRow_SSSE3 but the shuffle also reverses each 3-byte triple.
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRAWToARGB
+ // 16 pixels per pass; the movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:7] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+// Converts 8 RGB565 pixels per pass to ARGB, using pmulhuw to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks the output
+ sub edx, eax
+ // Unaligned load; the movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0 // interleave RB with AG bytes -> B, G, R, A
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// Converts 8 ARGB1555 pixels per pass to ARGB (alpha bit -> 0x00/0xff). 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks the output
+ sub edx, eax
+ // Unaligned load; the movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A: arithmetic shift copies the alpha bit down the high byte
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// Converts 8 ARGB4444 pixels per pass to ARGB by doubling nibbles. 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax // edx = dst - 2 * src, so [eax * 2 + edx] tracks the output
+ sub edx, eax
+ // Unaligned load; the movdqa stores need dst_argb 16-byte aligned.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1 // low nibble n -> byte nn
+ por xmm2, xmm3 // high nibble n -> byte nn
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // interleave low-nibble and high-nibble bytes
+ punpckhbw xmm1, xmm2
+ movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+// Packs 16 ARGB pixels (64 bytes) per pass to 48 bytes of RGB24, dropping alpha.
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+ // All loads and stores use movdqu, so neither pointer needs alignment.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+// Like ARGBToRGB24Row_SSSE3 but the shuffle also reverses each 3-byte triple.
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
+ // All loads and stores use movdqu, so neither pointer needs alignment.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+// Packs 4 ARGB pixels per pass to RGB565, keeping the top 5/6/5 bits of B/G/R.
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+ // The movdqa load needs src_argb 16-byte aligned; the movq store does not.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R, sign-extended so packssdw below does not saturate
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+// Packs 4 ARGB pixels per pass to ARGB1555: top alpha bit plus top 5 bits of R/G/B.
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+ // The movdqa load needs src_argb 16-byte aligned; the movq store does not.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A, sign-extended so packssdw below does not saturate
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+// Packs 4 ARGB pixels per pass to ARGB4444 by keeping each channel's top nibble.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+ // The movdqa load needs src_argb 16-byte aligned; the movq store does not.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+ psrlq xmm0, 4 // masked nibbles stay inside their 16-bit word, so a
+ psrlq xmm1, 8 // 64-bit logical shift cannot leak bits between pixels
+ por xmm0, xmm1
+ packuswb xmm0, xmm0 // every word is <= 0xff here, so no saturation occurs
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values: (weighted sum >> 7) + 16.
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+ // movdqa loads and store: src_argb and dst_y must be 16-byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // per pixel: two word sums of weighted byte pairs
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the two word sums -> one word per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values: (weighted sum + 64) >> 7.
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+ // movdqa loads and store: src_argb and dst_y must be 16-byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4 // per pixel: two word sums of weighted byte pairs
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // add the two word sums -> one word per pixel
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values: (weighted sum >> 7) + 16.
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToY
+ vbroadcastf128 ymm5, kAddY16
+ vmovdqa ymm6, kPermdARGBToY_AVX
+ // Row loads and the store use vmovdqu, so neither pointer needs alignment.
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5 // + 16
+ sub ecx, 32 // flags from this sub feed the jg; vmovdqu/lea leave them intact
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+// NOTE(review): #ifdef tests HAS_ARGBTOYROW_AVX2 but #endif names HAS_ARGBTOYJROW_AVX2; confirm.
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values: (weighted sum + 64) >> 7.
+__declspec(naked) __declspec(align(32))
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToYJ
+ vbroadcastf128 ymm5, kAddYJ64
+ vmovdqa ymm6, kPermdARGBToY_AVX
+ // Row loads and the store use vmovdqu, so neither pointer needs alignment.
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ sub ecx, 32 // flags from this sub feed the jg; vmovdqu/lea leave them intact
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+// ARGBToYRow_SSSE3 with movdqu loads/stores: src_argb and dst_y may be unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+ // 16 pixels (64 bytes) in, 16 Y bytes out per pass.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqu/lea leave them intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// ARGBToYJRow_SSSE3 with movdqu loads/stores: src_argb and dst_y may be unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+ // 16 pixels (64 bytes) in, 16 Y bytes out per pass.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // add 64 (0.5 before the >> 7) for rounding
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ sub ecx, 16 // flags from this sub feed the jg; movdqu/lea leave them intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// Convert 16 BGRA pixels (64 bytes) to 16 Y values using the kBGRAToY weights.
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+ // movdqa loads and store: src_argb and dst_y must be 16-byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// BGRAToYRow_SSSE3 with movdqu loads/stores: src_argb and dst_y may be unaligned.
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+ // 16 pixels (64 bytes) in, 16 Y bytes out per pass.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqu/lea leave them intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// Convert 16 ABGR pixels (64 bytes) to 16 Y values using the kABGRToY weights.
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+ // movdqa loads and store: src_argb and dst_y must be 16-byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// ABGRToYRow_SSSE3 with movdqu loads/stores: src_argb and dst_y may be unaligned.
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+ // 16 pixels (64 bytes) in, 16 Y bytes out per pass.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqu/lea leave them intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// Convert 16 RGBA pixels (64 bytes) to 16 Y values using the kRGBAToY weights.
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+ // movdqa loads and store: src_argb and dst_y must be 16-byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqa/lea leave them intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// RGBAToYRow_SSSE3 with movdqu loads/stores: src_argb and dst_y may be unaligned.
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+ // 16 pixels (64 bytes) in, 16 Y bytes out per pass.
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1 // one word sum per pixel
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5 // + 16
+ sub ecx, 16 // flags from this sub feed the jg; movdqu/lea leave them intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+// Averages 2x2 ARGB blocks from two rows and writes 8 U and 8 V bytes per pass.
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+ // movdqa loads and pavgb memory operands: both source rows 16-byte aligned.
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi] // average with the row src_stride_argb bytes away
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm4, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm4 // average horizontal neighbours
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // flags from this sub feed the jg; the moves/lea leave them intact
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+// ARGBToUVRow_SSSE3 with the UJ/VJ weights; kAddUVJ128 is added before the shift.
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToUJ
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm5, kAddUVJ128
+ sub edi, edx // stride from u to v
+ // movdqa loads and pavgb memory operands: both source rows 16-byte aligned.
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi] // average with the row src_stride_argb bytes away
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm4, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm4 // average horizontal neighbours
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // flags from this sub feed the jg; the moves/lea leave them intact
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2 // AVX2: 32x2 ARGB pixels -> 16 U and 16 V bytes per pass.
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vbroadcastf128 ymm5, kAddUV128
+ vbroadcastf128 ymm6, kARGBToV
+ vbroadcastf128 ymm7, kARGBToU
+ sub edi, edx // stride from u to v
+ // Row loads use vmovdqu, so the source rows need no alignment.
+ align 4
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi] // average with the row src_stride_argb bytes away
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ sub ecx, 32 // flags from this sub feed the jg; the stores/lea leave them intact
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+// ARGBToUVRow_SSSE3 with movdqu row loads, so the source rows may be unaligned.
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+ // The second row goes through xmm4 because pavgb's memory form needs alignment.
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm4, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm4 // average horizontal neighbours
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // flags from this sub feed the jg; the moves/lea leave them intact
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kARGBToUJ // multiplier for the U pmaddubsw
+ movdqa xmm6, kARGBToVJ // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUVJ128 // added to the 16-bit sums before the shift
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax] // row 0: 64 bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi] // row 1 (src + stride), staged in xmm4
+ pavgb xmm0, xmm4 // average row 0 with row 1
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push edi // restored before ret
+ mov eax, [esp + 4 + 4] // src_argb (the first 4 skips the push)
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix (the width argument)
+ movdqa xmm7, kARGBToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kARGBToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* convert to U and V */
+ movdqa xmm0, [eax] // U: 64 source bytes, aligned loads
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2 // 16 U bytes
+ paddb xmm0, xmm5 // add kAddUV128
+ sub ecx, 16 // sets the flags jg tests; nothing below changes them
+ movdqa [edx], xmm0 // store 16 U (aligned store)
+
+ movdqa xmm0, [eax] // V: the same 64 source bytes are reloaded
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2 // 16 V bytes
+ paddb xmm0, xmm5 // add kAddUV128
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa [edx + edi], xmm0 // store 16 V (aligned store)
+ lea edx, [edx + 16]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push edi // restored before ret
+ mov eax, [esp + 4 + 4] // src_argb (the first 4 skips the push)
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix (the width argument)
+ movdqa xmm7, kARGBToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kARGBToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U: 64 source bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2 // 16 U bytes
+ paddb xmm0, xmm5 // add kAddUV128
+ sub ecx, 16 // sets the flags jg tests; nothing below changes them
+ movdqu [edx], xmm0 // store 16 U (unaligned store)
+
+ movdqu xmm0, [eax] // V: the same 64 source bytes are reloaded
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2 // 16 V bytes
+ paddb xmm0, xmm5 // add kAddUV128
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqu [edx + edi], xmm0 // store 16 V (unaligned store)
+ lea edx, [edx + 16]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push edi // restored before ret
+ mov eax, [esp + 4 + 4] // src_argb (the first 4 skips the push)
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix (the width argument)
+ movdqa xmm7, kARGBToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kARGBToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x1 argb pixels to 8x1 (single row, no stride) */
+ movdqa xmm0, [eax] // 64 source bytes, aligned loads
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push edi // restored before ret
+ mov eax, [esp + 4 + 4] // src_argb (the first 4 skips the push)
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix (the width argument)
+ movdqa xmm7, kARGBToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kARGBToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x1 argb pixels to 8x1 (single row, no stride) */
+ movdqu xmm0, [eax] // 64 source bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kBGRAToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kBGRAToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax] // row 0: movdqa needs a 16-byte aligned address
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi] // average with row 1; operand must be aligned too
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kBGRAToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kBGRAToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax] // row 0: 64 bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi] // row 1 (src + stride), staged in xmm4
+ pavgb xmm0, xmm4 // average row 0 with row 1
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kABGRToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kABGRToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax] // row 0: movdqa needs a 16-byte aligned address
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi] // average with row 1; operand must be aligned too
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kABGRToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kABGRToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax] // row 0: 64 bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi] // row 1 (src + stride), staged in xmm4
+ pavgb xmm0, xmm4 // average row 0 with row 1
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kRGBAToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kRGBAToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax] // row 0: movdqa needs a 16-byte aligned address
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi] // average with row 1; operand must be aligned too
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // src_argb (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix (the width argument)
+ movdqa xmm7, kRGBAToU // multiplier for the U pmaddubsw
+ movdqa xmm6, kRGBAToV // multiplier for the V pmaddubsw
+ movdqa xmm5, kAddUV128 // added after packing (see paddb below)
+ sub edi, edx // stride from u to v: [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax] // row 0: 64 bytes, unaligned loads
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi] // row 1 (src + stride), staged in xmm4
+ pavgb xmm0, xmm4 // average row 0 with row 1
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64] // advance source by 64 bytes
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88 // even dwords of xmm0, xmm1
+ shufps xmm4, xmm1, 0xdd // odd dwords of xmm0, xmm1
+ pavgb xmm0, xmm4 // average horizontally adjacent pixels
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2 // 8 U sums as 16-bit words
+ phaddw xmm1, xmm3 // 8 V sums as 16-bit words
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1 // low 8 bytes = U, high 8 bytes = V
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16 // sets the flags jg tests; the stores and lea keep them
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 maximum of 127 */
+#define UG (-25) /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG (-52) /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias: (U coeff + V coeff) * 128; parenthesized so each expands as one value.
+#define BB (UB * 128 + VB * 128)
+#define BG (UG * 128 + VG * 128)
+#define BR (UR * 128 + VR * 128)
+
+#ifdef HAS_I422TOARGBROW_AVX2
+
+static const lvec8 kUVToB_AVX = { // U,V multiplier pairs for blue
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+static const lvec8 kUVToR_AVX = { // U,V multiplier pairs for red
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+static const lvec8 kUVToG_AVX = { // U,V multiplier pairs for green
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+static const lvec16 kYToRgb_AVX = { // Y multiplier (YG) in every entry
+ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
+};
+static const lvec16 kYSub16_AVX = { // subtracted from Y before scaling
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+};
+static const lvec16 kUVBiasB_AVX = { // blue bias, removed with vpsubw
+ BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
+};
+static const lvec16 kUVBiasG_AVX = { // green bias, removed with vpsubw
+ BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
+};
+static const lvec16 kUVBiasR_AVX = { // red bias, removed with vpsubw
+ BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
+};
+
+// 16 pixels per loop iteration; stores use vmovdqu (unaligned).
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpxor ymm4, ymm4, ymm4 // zero, interleaved with the Y bytes below
+
+ align 4
+ convertloop:
+ vmovq xmm0, qword ptr [esi] // U
+ vmovq xmm1, qword ptr [esi + edi] // V
+ lea esi, [esi + 8] // advance U (and V via edi) by 8 bytes
+ vpunpcklbw ymm0, ymm0, ymm1 // UV
+ vpermq ymm0, ymm0, 0xd8 // qword order 0,2,1,3 before the per-lane unpack
+ vpunpcklwd ymm0, ymm0, ymm0 // UVUV
+ vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
+ vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
+ vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
+ vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
+ vpsubw ymm1, ymm1, kUVBiasG_AVX
+ vpsubw ymm0, ymm0, kUVBiasR_AVX
+
+ // Step 2: Find Y contribution to 16 R,G,B values
+ vmovdqu xmm3, [eax] // NOLINT
+ lea eax, [eax + 16] // advance Y by 16 bytes
+ vpermq ymm3, ymm3, 0xd8 // qword order 0,2,1,3 before the per-lane unpack
+ vpunpcklbw ymm3, ymm3, ymm4 // Y bytes -> words (ymm4 is zero)
+ vpsubsw ymm3, ymm3, kYSub16_AVX
+ vpmullw ymm3, ymm3, kYToRgb_AVX
+ vpaddsw ymm2, ymm2, ymm3 // B += Y
+ vpaddsw ymm1, ymm1, ymm3 // G += Y
+ vpaddsw ymm0, ymm0, ymm3 // R += Y
+ vpsraw ymm2, ymm2, 6
+ vpsraw ymm1, ymm1, 6
+ vpsraw ymm0, ymm0, 6
+ vpackuswb ymm2, ymm2, ymm2 // B
+ vpackuswb ymm1, ymm1, ymm1 // G
+ vpackuswb ymm0, ymm0, ymm0 // R
+
+ // Step 3: Weave into ARGB
+ vpunpcklbw ymm2, ymm2, ymm1 // BG
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm5 // RA
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64] // advance destination by 64 bytes
+ sub ecx, 16
+ jg convertloop // loop while pixels remain (ecx > 0)
+ vzeroupper
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
+static const vec8 kUVToB = { // U,V multiplier pairs for blue
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = { // U,V multiplier pairs for red
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = { // U,V multiplier pairs for green
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = { // V,U order, used by YVUTORGB
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = { // V,U order, used by YVUTORGB
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = { // V,U order, used by YVUTORGB
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; // Y multiplier
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; // Y offset
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; // blue bias
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; // green bias
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // red bias
+
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444 into xmm0. Reads [esi] and [esi + edi]; advances esi.
+#define READYUV444 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] /* 8 bytes consumed */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV in xmm0. Advances esi by 4.
+#define READYUV422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] /* 4 bytes consumed */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 2 UV from 411, upsample to 8 UV in xmm0. Uses ebx as scratch.
+#define READYUV411 __asm { \
+ __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
+ __asm movd xmm0, ebx \
+ __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm movd xmm1, ebx \
+ __asm lea esi, [esi + 2] /* 2 bytes consumed */ \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 4 UV from NV12 (interleaved at [esi]), upsample to 8 UV in xmm0.
+#define READNV12 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] /* 8 bytes consumed */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 8 pixels: 8 UV (xmm0) and 8 Y ([eax]) -> B in xmm0, G xmm1, R xmm2.
+#define YUVTORGB __asm { \
+ /* Step 1: Find UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] /* advance Y pointer */ \
+ __asm punpcklbw xmm3, xmm4 /* Y bytes -> words; callers zero xmm4 */ \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 /* coefficients are scaled by 64 */ \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Convert 8 pixels: 8 VU (xmm0) and 8 Y ([eax]) -> B in xmm0, G xmm1, R xmm2.
+#define YVUTORGB __asm { \
+ /* Step 1: Find VU contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kVUToB /* scale B VU */ \
+ __asm pmaddubsw xmm1, kVUToG /* scale G VU */ \
+ __asm pmaddubsw xmm2, kVUToR /* scale R VU */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] /* advance Y pointer */ \
+ __asm punpcklbw xmm3, xmm4 /* Y bytes -> words; callers zero xmm4 */ \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 /* coefficients are scaled by 64 */ \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// 8 pixels, dest aligned 16 (stores use movdqa).
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+
+ align 4
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance destination by 32 bytes
+ sub ecx, 8
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned (stores use movq and movdqu).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb24
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+ movdqa xmm5, kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRR
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+ pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
+ pshufb xmm1, xmm6 // Pack into first 12 bytes.
+ palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
+ movq qword ptr [edx], xmm0 // First 8 bytes
+ movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned (stores use movq and movdqu).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // raw
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+ movdqa xmm5, kShuffleMaskARGBToRAW_0
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRR
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+ pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
+ pshufb xmm1, xmm6 // Pack into first 12 bytes.
+ palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
+ movq qword ptr [edx], xmm0 // First 8 bytes
+ movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb565
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRR
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+
+ // Step 3b: BGRR -> RGB565
+ movdqa xmm3, xmm0 // B first 4 pixels of argb
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm3, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R (arithmetic shift sign-extends for packssdw)
+ pand xmm3, xmm5 // B
+ pand xmm2, xmm6 // G
+ pand xmm0, xmm7 // R
+ por xmm3, xmm2 // BG
+ por xmm0, xmm3 // BGR
+ movdqa xmm3, xmm1 // B next 4 pixels of argb
+ movdqa xmm2, xmm1 // G
+ pslld xmm1, 8 // R
+ psrld xmm3, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm1, 16 // R (arithmetic shift sign-extends for packssdw)
+ pand xmm3, xmm5 // B
+ pand xmm2, xmm6 // G
+ pand xmm1, xmm7 // R
+ por xmm3, xmm2 // BG
+ por xmm1, xmm3 // BGR
+ packssdw xmm0, xmm1 // 8 dwords -> 8 RGB565 words
+ sub ecx, 8 // sets the flags jg tests; movdqu and lea keep them
+ movdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16 (stores use movdqa).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 8 + 4] // Y (the 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance destination by 32 bytes
+ sub ecx, 8
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16 (stores use movdqa).
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm { // naked: arguments are read directly from the stack
+ push ebx // scratch for READYUV411; restored before ret
+ push esi // restored before ret
+ push edi // restored before ret
+ mov eax, [esp + 12 + 4] // Y (the 12 skips the three pushes)
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
+ sub edi, esi // edi = V - U, so [esi + edi] addresses V
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // zero, used by YUVTORGB to widen Y
+
+ align 4
+ convertloop:
+ READYUV411 // modifies EBX
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance destination by 32 bytes
+ sub ecx, 8
+ jg convertloop // loop while pixels remain (ecx > 0)
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb must be 16 byte aligned (movdqa stores).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // Y (the extra 4 skips the push)
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0 // aligned store
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb must be 16 byte aligned (movdqa stores).
+// 4 VU values upsampled to 8, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // Y (the extra 4 skips the push)
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0 // aligned store
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb may be unaligned (movdqu stores).
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb may be unaligned (movdqu stores).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb may be unaligned (movdqu stores).
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push ebx // saved because READYUV411 modifies EBX
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // Y (the extra 12 skips the three pushes)
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV411 // modifies EBX
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb may be unaligned (movdqu stores).
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // Y (the extra 4 skips the push)
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels per loop pass; dst_argb may be unaligned (movdqu stores).
+// 4 VU values upsampled to 8, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // Y (the extra 4 skips the push)
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0 // keep a copy of BG for the high half
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to BGRA, 8 pixels per loop pass, aligned dst
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (redone each pass: xmm5 is overwritten below)
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5 // keep a copy of AR for the high half
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqa [edx], xmm5 // aligned store
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to BGRA, 8 pixels per loop pass, unaligned dst
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (redone each pass: xmm5 is overwritten below)
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5 // keep a copy of AR for the high half
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqu [edx], xmm5 // unaligned store
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to ABGR, 8 pixels per loop pass, aligned dst
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ABGR
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2 // keep a copy of RG for the high half
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqa [edx], xmm2 // aligned store
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to ABGR, 8 pixels per loop pass, unaligned dst
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ABGR
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2 // keep a copy of RG for the high half
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqu [edx], xmm2 // unaligned store
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to RGBA, 8 pixels per loop pass, aligned dst
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (redone each pass: xmm5 is overwritten below)
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5 // keep a copy of AB for the high half
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqa [edx], xmm5 // aligned store
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // I422 to RGBA, 8 pixels per loop pass, unaligned dst
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // Y (the extra 8 skips the two pushes)
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi // edi = V - U, i.e. V expressed as an offset from U
+ pxor xmm4, xmm4 // xmm4 = 0; presumably consumed by the macros below
+
+ align 4
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha (redone each pass: xmm5 is overwritten below)
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5 // keep a copy of AB for the high half
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqu [edx], xmm5 // unaligned store
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+__declspec(naked) __declspec(align(16)) // Y to grey ARGB, 8 pixels per loop pass, aligned dst
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pxor xmm5, xmm5 // xmm5 = 0, used to zero extend Y bytes to words
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ mov eax, 0x00100010 // 16 in each 16 bit lane
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0 // broadcast to all 4 dwords
+ mov eax, 0x004a004a // 74
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0 // broadcast to all 4 dwords
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ align 4
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax] // 8 Y bytes
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm5 // 0.Y
+ psubusw xmm0, xmm3 // y - 16, saturating at 0
+ pmullw xmm0, xmm2 // * 74
+ psrlw xmm0, 6 // / 64; 74 / 64 = 1.15625 approximates 1.164
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0 // keep a copy of GG for the high half
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4 // force alpha byte to 0xff
+ por xmm1, xmm4
+ movdqa [edx], xmm0 // aligned store
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // advance dst by 8 pixels (32 bytes)
+ sub ecx, 8 // 8 pixels done
+ jg convertloop // loop while pixels remain
+
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_SSE2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes (output byte i takes input byte 15 - i).
+static const uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16)) // reverses a row of bytes, 16 per loop pass, aligned loads and stores
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kShuffleMirror
+ lea eax, [eax - 16] // so [eax + ecx] addresses the last 16 unread source bytes
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax + ecx] // read from the end of the row backwards
+ pshufb xmm0, xmm5 // reverse the 16 bytes
+ sub ecx, 16 // sets the flags tested by jg; movdqa and lea do not alter flags
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+// Shuffle table for reversing the bytes within each 16 byte half.
+static const ulvec8 kShuffleMirror_AVX2 = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16)) // reverses a row of bytes, 32 per loop pass, unaligned loads and stores
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqa ymm5, kShuffleMirror_AVX2
+ lea eax, [eax - 32] // so [eax + ecx] addresses the last 32 unread source bytes
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax + ecx] // read from the end of the row backwards
+ vpshufb ymm0, ymm0, ymm5 // reverse bytes inside each 128 bit half
+ vpermq ymm0, ymm0, 0x4e // swap high and low halves
+ sub ecx, 32 // sets the flags tested by jg; vmovdqu and lea do not alter flags
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
+// version can not. Reverses 16 bytes per loop pass.
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16] // so [eax + ecx] addresses the last 16 unread source bytes
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax + ecx] // read from the end of the row backwards
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ sub ecx, 16 // sets the flags tested by jg; movdqu and lea do not alter flags
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels: even source bytes
+// reversed into the low 8 bytes, odd source bytes reversed into the high 8.
+static const uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16] // last 16 bytes of src (2 bytes per unit of width)
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // aligned load of 8 interleaved pairs
+ lea eax, [eax - 16] // walk src backwards
+ pshufb xmm0, xmm1 // split and reverse
+ sub ecx, 8 // sets the flags tested by jg; the moves and lea do not alter flags
+ movlpd qword ptr [edx], xmm0 // low 8 bytes (even source bytes) to dst_u
+ movhpd qword ptr [edx + edi], xmm0 // high 8 bytes (odd source bytes) to dst_v
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the order of four 4 byte pixels (bytes within a pixel keep their order).
+static const uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+__declspec(naked) __declspec(align(16)) // reverses a row of ARGB pixels, 4 per loop pass, aligned loads and stores
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
+ movdqa xmm5, kARGBShuffleMirror
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax - 16] // walk src backwards
+ pshufb xmm0, xmm5 // reverse the 4 pixels
+ sub ecx, 4 // sets the flags tested by jg; movdqa and lea do not alter flags
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table of dword indices for reversing the order of 8 pixels.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16)) // reverses a row of ARGB pixels, 8 per loop pass
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 32] // so [eax + ecx * 4] addresses the last 8 unread pixels
+ vmovdqa ymm5, kARGBShuffleMirror_AVX2
+
+ align 4
+ convertloop:
+ vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
+ sub ecx, 8 // sets the flags tested by jg; vmovdqu and lea do not alter flags
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked) __declspec(align(16)) // deinterleaves UV into U and V, 16 pairs per loop pass, aligned loads and stores
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0 // copies for the odd byte extraction
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 U
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3 // 16 V
+ movdqa [edx], xmm0
+ movdqa [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pairs done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // deinterleaves UV into U and V, 16 pairs per loop pass, unaligned loads and stores
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0 // copies for the odd byte extraction
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 U
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3 // 16 V
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16 // 16 pairs done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) __declspec(align(16)) // deinterleaves UV into U and V, 32 pairs per loop pass, unaligned loads and stores
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // packs per 128 bit lane, leaving qwords interleaved
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to restore linear order
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0 // 32 U
+ vmovdqu [edx + edi], ymm2 // 32 V
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 pairs done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked) __declspec(align(16)) // interleaves U and V into UV, 16 pairs per loop pass, aligned loads and stores
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax // edx = src_v - src_u, so [eax + edx] addresses src_v
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 16 U's
+ movdqa xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0 // copy of U for the high half
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16 // 16 pairs done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // interleaves U and V into UV, 16 pairs per loop pass, unaligned loads and stores
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+ uint8* dst_uv, int width) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax // edx = src_v - src_u, so [eax + edx] addresses src_v
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0 // copy of U for the high half
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16 // 16 pairs done
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked) __declspec(align(16)) // interleaves U and V into UV, 32 pairs per loop pass, unaligned loads and stores
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax // edx = src_v - src_u, so [eax + edx] addresses src_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ vmovdqu [edi], ymm1
+ vmovdqu [edi + 32], ymm2
+ lea edi, [edi + 64]
+ sub ecx, 32 // 32 pairs done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte aligned loads/stores, 32 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // aligned loads
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa [edx], xmm0 // aligned stores
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32 // 32 bytes done
+ jg convertloop // loop while bytes remain
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+// Copies 'count' bytes with rep movsb. Unaligned Multiple of 1.
+__declspec(naked) __declspec(align(16))
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi // stash esi in eax; restored before ret
+ mov edx, edi // stash edi in edx; restored before ret
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ rep movsb // copy ecx bytes from [esi] to [edi]
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+
+#ifdef HAS_COPYROW_X86
+__declspec(naked) __declspec(align(16)) // copies count / 4 dwords with rep movsd
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi // stash esi in eax; restored before ret
+ mov edx, edi // stash edi in edx; restored before ret
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2 // bytes to dwords; any count % 4 tail bytes are not copied
+ rep movsd // copy ecx dwords from [esi] to [edi]
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels. Replaces the top byte of each dst dword with the one from src; 8 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ align 4
+ convertloop:
+ movdqa xmm2, [eax] // 8 src pixels (aligned loads)
+ movdqa xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm4, [edx] // 8 dst pixels (aligned loads)
+ movdqa xmm5, [edx + 16]
+ pand xmm2, xmm0 // keep only the top byte of src
+ pand xmm3, xmm0
+ pand xmm4, xmm1 // keep the low 3 bytes of dst
+ pand xmm5, xmm1
+ por xmm2, xmm4 // merge
+ por xmm3, xmm5
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels. Replaces the top byte of each dst dword with the one from src; 16 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ align 4
+ convertloop:
+ vmovdqu ymm1, [eax] // 16 src pixels
+ vmovdqu ymm2, [eax + 32]
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0 // dst bytes where the mask is set, src top byte elsewhere
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels. Writes each src byte into the top byte of the matching dst dword; 8 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ align 4
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2 // each Y doubled into a 16 bit word
+ punpckhwd xmm3, xmm2 // Y 4..7 into the upper word of each dword; stale low words are masked off below
+ punpcklwd xmm2, xmm2 // Y 0..3 replicated across each dword
+ movdqa xmm4, [edx] // 8 dst pixels (aligned loads)
+ movdqa xmm5, [edx + 16]
+ pand xmm2, xmm0 // keep Y only in the top byte
+ pand xmm3, xmm0
+ pand xmm4, xmm1 // keep the low 3 bytes of dst
+ pand xmm5, xmm1
+ por xmm2, xmm4 // merge
+ por xmm3, xmm5
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8 // 8 pixels done
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels. Writes each src byte into the top byte of the matching dst dword; 16 pixels per loop pass.
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ align 4
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax] // zero extend 8 bytes to 8 dwords
+ vpmovzxbd ymm2, qword ptr [eax + 8]
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24 // move the byte into the top byte of each dword
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0 // dst bytes where the mask is set, new top byte elsewhere
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16 // 16 pixels done
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// SetRow_X86 writes 'count' bytes (as count / 4 dwords) using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow_X86(uint8* dst, uint32 v32, int count) {
+ __asm {
+ mov edx, edi // stash edi in edx; restored before ret
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ shr ecx, 2 // bytes to dwords; any count % 4 tail bytes are not written
+ rep stosd // store eax to [edi], ecx times
+ mov edi, edx
+ ret
+ }
+}
+
+// ARGBSetRows_X86 writes 'width' dwords of a repeated 32 bit value on each of 'height' rows.
+__declspec(naked) __declspec(align(16))
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ __asm {
+ push esi // esi, edi and ebp are restored by the pops before ret
+ push edi
+ push ebp
+ mov edi, [esp + 12 + 4] // dst
+ mov eax, [esp + 12 + 8] // v32
+ mov ebp, [esp + 12 + 12] // width
+ mov edx, [esp + 12 + 16] // dst_stride
+ mov esi, [esp + 12 + 20] // height
+ lea ecx, [ebp * 4] // width in bytes
+ sub edx, ecx // stride - width * 4
+
+ align 4
+ convertloop:
+ mov ecx, ebp // reload the per-row dword count (rep stosd consumes ecx)
+ rep stosd // fill one row
+ add edi, edx // skip the remainder of the stride to the next row
+ sub esi, 1 // one row done
+ jg convertloop // loop while rows remain
+
+ pop ebp
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked) __declspec(align(16)) // extracts Y from YUY2, 32 pixels per loop pass
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // 64 source bytes = 32 pixels
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ sub ecx, 32 // sets the flags tested by jg; vmovdqu and lea do not alter flags
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 to U and V averaged over two rows, 32 pixels per loop pass
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi] // average with the row stride_yuy2 bytes away
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16] // 16 U and 16 V bytes written
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 to U and V from a single row, 32 pixels per loop pass
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16] // 16 U and 16 V bytes written
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // extracts Y from UYVY, 32 pixels per loop pass
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // 64 source bytes = 32 pixels
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ sub ecx, 32 // sets the flags tested by jg; vmovdqu and lea do not alter flags
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper // clear upper YMM state; must execute before ret, not after it
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY to U and V averaged over two rows, 32 pixels per loop pass
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // esi and edi are restored by the pops before ret
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi] // average with the row stride_uyvy bytes away
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16] // 16 U and 16 V bytes written
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY to U and V from a single row, 32 pixels per loop pass
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // restored by the pop before ret
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses dst_v
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8 // reorder qwords 0,2,1,3 to undo the per-lane pack
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16] // 16 U and 16 V bytes written
+ sub ecx, 32 // 32 pixels done
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) __declspec(align(16)) // YUY2 -> Y plane, 16 pixels per pass.
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: src must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ sub ecx, 16 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // movdqa: dst must be 16-byte aligned
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 -> U and V planes, averaging two rows.
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // two pushes: arguments are now at esp + 8 + N
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: both source rows must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi] // same pixels in the row at src + stride
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 -> U and V planes from a single row.
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: src must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 -> Y plane, unaligned pointers allowed.
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on src
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ sub ecx, 16 // sets the flags jg tests; movdqu and lea leave them alone
+ movdqu [edx], xmm0 // movdqu: no alignment requirement on dst
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 -> U and V, two rows averaged, unaligned.
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // two pushes: arguments are now at esp + 8 + N
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on either row
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi] // same pixels in the row at src + stride
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // register copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // YUY2 -> U and V from a single row, unaligned.
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on src
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // register copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> Y plane, 16 pixels per pass.
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: src must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ sub ecx, 16 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // movdqa: dst must be 16-byte aligned
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> U and V planes, averaging two rows.
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // two pushes: arguments are now at esp + 8 + N
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: both source rows must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi] // same pixels in the row at src + stride
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> U and V planes from a single row.
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // movdqa: src must be 16-byte aligned
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> Y plane, unaligned pointers allowed.
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on src
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 16 words -> 16 Y bytes
+ sub ecx, 16 // sets the flags jg tests; movdqu and lea leave them alone
+ movdqu [edx], xmm0 // movdqu: no alignment requirement on dst
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> U and V, two rows averaged, unaligned.
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi // two pushes: arguments are now at esp + 8 + N
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on either row
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi] // same pixels in the row at src + stride
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pavgb xmm0, xmm2 // average the two rows
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // register copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // UYVY -> U and V from a single row, unaligned.
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx // edi = dst_v - dst_u, so [edx + edi] addresses V
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // movdqu: no alignment requirement on src
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32] // 32 source bytes = 16 pixels
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1 // 16 interleaved U, V bytes
+ movdqa xmm1, xmm0 // register copy kept for the V extraction
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0 // store 8 U bytes
+ movq qword ptr [edx + edi], xmm1 // store 8 V bytes
+ lea edx, [edx + 8]
+ sub ecx, 16 // 16 pixels consumed
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend src_argb0 over src_argb1 using src_argb0 alpha; 1 pixel at a time until dst is 16-byte aligned, then 4 at a time, then the remainder 1 at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1 // ecx = width - 1, so the 1 pixel loops can test with jge
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b // width <= 0: nothing to do
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax] // src_argb0 pixel
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1 // sets the flags jge tests; movd and lea leave them alone
+ movd [edx], xmm0 // store 1 pixel
+ lea edx, [edx + 4]
+ jge alignloop1 // pixels remain: re-test the alignment
+
+ alignloop1b:
+ add ecx, 1 - 4 // ecx = remaining pixels - 4
+ jl convertloop4b // fewer than 4 pixels remain
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4 // sets the flags jge tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // aligned store: dst was aligned by alignloop1
+ lea edx, [edx + 16]
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl convertloop1b // no pixels remain
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1 // sets the flags jge tests; movd and lea leave them alone
+ movd [edx], xmm0 // store 1 pixel
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha: copies each pixel's alpha byte (3, 7, 11, 15) into the low byte of both of its words; 0x80 zeroes the high byte.
+static const uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+// pshuflw xmm3, xmm3, 0F5h
+// with..
+// pshufb xmm3, kShuffleAlpha // alpha
+// Blends 4 pixels at a time in the main loops, with aligned and unaligned source variants.
+
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // one push: arguments are now at esp + 4 + N
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1 // ecx = width - 1, so the 1 pixel loops can test with jge
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b // width <= 0: nothing to do
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax] // src_argb0 pixel
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1 // sets the flags jge tests; movd and lea leave them alone
+ movd [edx], xmm0 // store 1 pixel
+ lea edx, [edx + 4]
+ jge alignloop1 // pixels remain: re-test the alignment
+
+ alignloop1b:
+ add ecx, 1 - 4 // ecx = remaining pixels - 4
+ jl convertloop4b // fewer than 4 pixels remain
+
+ test eax, 15 // unaligned?
+ jne convertuloop4
+ test esi, 15 // unaligned?
+ jne convertuloop4
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqa xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqa xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqa xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4 // sets the flags jge tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // aligned store: dst was aligned by alignloop1
+ lea edx, [edx + 16]
+ jge convertloop4
+ jmp convertloop4b // skip the unaligned variant
+
+ // 4 pixel unaligned loop.
+ convertuloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4 // sets the flags jge tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // aligned store: only the sources are unaligned here
+ lea edx, [edx + 16]
+ jge convertuloop4
+
+ convertloop4b:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl convertloop1b // no pixels remain
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1 // sets the flags jge tests; movd and lea leave them alone
+ movd [edx], xmm0 // store 1 pixel
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time: B, G and R are scaled by the pixel's alpha, alpha itself is kept.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // alphas
+ lea eax, [eax + 16]
+ psrlw xmm0, 8 // back to 8 bit range
+ pand xmm2, xmm4 // isolate the original alpha bytes
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 4 attenuated pixels
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ sub ecx, 4 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // movdqa: dst must be 16-byte aligned
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle tables duplicating alpha: each pixel's alpha byte fills its B, G and R words; 128 zeroes the A word.
+static const uvec8 kShuffleAlpha0 = { // pixels 0 and 1 (alpha bytes 3 and 7)
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = { // pixels 2 and 3 (alpha bytes 11 and 15)
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16)) // Attenuate 4 pixels per pass; unaligned access.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, kShuffleAlpha0 // alpha shuffle for pixels 0 and 1
+ movdqa xmm5, kShuffleAlpha1 // alpha shuffle for pixels 2 and 3
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqu xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqu xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
+ pand xmm2, xmm3
+ psrlw xmm0, 8 // back to 8 bit range
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1 // 4 pixels; A words are 0 because the shuffle zeroed them
+ por xmm0, xmm2 // copy original alpha
+ sub ecx, 4 // sets the flags jg tests; movdqu and lea leave them alone
+ movdqu [edx], xmm0 // movdqu: no alignment requirement on dst
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha; applied to byte-doubled pixels, where bytes 6-7 and 14-15 of each lane hold the alpha words; 128 zeroes the A word.
+static const ulvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+ 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+ 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16)) // Attenuate 8 pixels per pass; unaligned access.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+ vmovdqa ymm4, kShuffleAlpha_AVX2 // aligned load of the shuffle table
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ align 4
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8 // back to 8 bit range
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ sub ecx, 8 // sets the flags jg tests; vmovdqu and lea leave them alone
+ vmovdqu [eax + edx], ymm0 // store 8 pixels to dst
+ lea eax, [eax + 32] // advances src and, through edx, dst
+ jg convertloop // body runs before the test, so it executes at least once
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time, using one fixed_invtbl8 lookup per pixel alpha.
+// Uses movdqu for loads and stores, so src and dst need not be 16-byte aligned.
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi // two pushes: arguments are now at esp + 8 + N
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4] // table entry indexed by alpha
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3 // combine the factors for pixels 0 and 1
+ pmulhuw xmm0, xmm2 // rgb * inv_alpha
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4] // table entry indexed by alpha
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3 // combine the factors for pixels 2 and 3
+ pmulhuw xmm1, xmm2 // rgb * inv_alpha
+ lea eax, [eax + 16]
+
+ packuswb xmm0, xmm1 // saturate to bytes: 4 pixels
+ sub ecx, 4 // sets the flags jg tests; movdqu and lea leave them alone
+ movdqu [edx], xmm0 // store 4 pixels
+ lea edx, [edx + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha: in each 8-byte group, word 0 is copied into words 0-2 and word 3 is kept.
+static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked) __declspec(align(16)) // Unattenuate 8 pixels per pass; vpgatherdd variant.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+ vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 // aligned load of the shuffle table
+
+ align 4
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ sub ecx, 8 // sets the flags jg tests; vmovdqu and lea leave them alone
+ vmovdqu [eax + edx], ymm0 // store 8 pixels to dst
+ lea eax, [eax + 32] // advances src and, through edx, dst
+ jg convertloop // body runs before the test, so it executes at least once
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked) __declspec(align(16)) // Unattenuate 8 pixels per pass; scalar table lookups.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax // edx = dst - src, so [eax + edx] addresses dst
+ vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 // aligned load of the shuffle table
+
+ push esi // pushed after the argument loads, so the esp offsets above are unbiased
+ push edi
+
+ align 4
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ sub ecx, 8 // sets the flags jg tests; vmovdqu and lea leave them alone
+ vmovdqu [eax + edx], ymm0 // store 8 pixels to dst
+ lea eax, [eax + 32] // advances src and, through edx, dst
+ jg convertloop // body runs before the test, so it executes at least once
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels; B, G and R all receive the gray value, alpha is kept.
+__declspec(naked) __declspec(align(16))
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToYJ // per-channel weights fed to pmaddubsw
+ movdqa xmm5, kAddYJ64 // rounding term added before the shift
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1 // one weighted sum per pixel
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqa xmm2, [eax] // A
+ movdqa xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24 // alpha moved to the low byte of each dword
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ sub ecx, 8 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // movdqa: dst must be 16-byte aligned
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constants for ARGB color to sepia tone: byte weights in memory order B, G, R, A (the A weight is 0), repeated for 4 pixels.
+static const vec8 kARGBToSepiaB = { // weights producing the output B channel
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = { // weights producing the output G channel
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = { // weights producing the output R channel
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place; alpha is kept.
+__declspec(naked) __declspec(align(16))
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6 // one weighted sum per pixel
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24 // alpha moved to the low byte of each dword
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [eax], xmm0 // written back over the source pixels
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx] // 16 int8 coefficients: 4 per output channel
+ pshufd xmm2, xmm5, 0x00 // coefficients 0-3, used for the B output
+ pshufd xmm3, xmm5, 0x55 // coefficients 4-7, used for the G output
+ pshufd xmm4, xmm5, 0xaa // coefficients 8-11, used for the R output
+ pshufd xmm5, xmm5, 0xff // coefficients 12-15, used for the A output
+ mov ecx, [esp + 16] /* width */
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm7, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm7, xmm2
+ movdqa xmm6, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm6, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqa xmm1, [eax] // R
+ movdqa xmm7, [eax + 16]
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
+ movdqa xmm6, [eax] // A
+ movdqa xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
+ sub ecx, 8 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [edx], xmm0 // movdqa: dst must be 16-byte aligned
+ movdqa [edx + 16], xmm6
+ lea eax, [eax + 32]
+ lea edx, [edx + 32]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes) in place: channel = (channel * scale >> 16) * interval_size + interval_offset; alpha is kept.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h // words 0-2 = low 16 bits of scale, word 3 = its high 16 bits
+ pshufd xmm2, xmm2, 044h // duplicate the low 4 words into the high half
+ pshuflw xmm3, xmm3, 040h // same replication for interval_size
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h // same replication for interval_offset
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqa xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_offset
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1 // saturate to bytes: 4 pixels
+ por xmm0, xmm7 // merge the original alpha back in
+ sub ecx, 4 // sets the flags jg tests; movdqa and lea leave them alone
+ movdqa [eax], xmm0 // written back over the source pixels
+ lea eax, [eax + 16]
+ jg convertloop // body runs before the test, so it executes at least once
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time: each channel is scaled by the matching byte of value.
+// src_argb and dst_argb must be aligned to 16 bytes (movdqa).
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ punpcklbw xmm2, xmm2 // each value byte v becomes the word v * 0x0101
+ punpcklqdq xmm2, xmm2 // same 4 words for both pixels in a register
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8 // keep the high byte of each 16 bit product
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 4 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time (unaligned ok).
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2 (byte a becomes word a * 0x0101)
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2 (byte b zero extended to a word)
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
+ packuswb xmm0, xmm1
+ sub ecx, 4 // sets flags for jg; movdqu/lea below leave flags intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels with unsigned saturation, 4 pixels at a time,
+// then 1 pixel at a time for any remainder. TODO(fbarchard): Port to posix etc.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ sub ecx, 4
+ jl convertloop49 // fewer than 4 pixels: skip the 4 pixel loop
+
+ align 4
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ sub ecx, 4 // sets flags for jge; movdqu/lea below leave flags intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+
+ convertloop49:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl convertloop19 // nothing left
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixels from src_argb0
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixels from src_argb1
+ lea esi, [esi + 4]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract src_argb1 from src_argb0 with unsigned saturation, 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ align 4
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ sub ecx, 4 // sets flags for jg; movdqu/lea below leave flags intact
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time (unaligned ok).
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ align 4
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1 // per-lane pack undoes the per-lane unpack
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to SSE code
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels with unsigned saturation, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to SSE code
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract src_argb1 from src_argb0 with unsigned saturation, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to SSE code
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked) __declspec(align(16))
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax // esi = src_y1 - src_y0, so [eax + esi] addresses src_y1
+ sub edi, eax // edi = src_y2 - src_y0
+ sub edx, eax // edx = dst_sobelx - src_y0
+ pxor xmm5, xmm5 // constant 0
+
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1 // middle row difference is added twice (weight 2)
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0 // saturate to 8 bits
+ sub ecx, 8 // sets flags for jg; movq/lea below leave flags intact
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked) __declspec(align(16))
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax // esi = src_y1 - src_y0, so [eax + esi] addresses src_y1
+ sub edx, eax // edx = dst_sobely - src_y0
+ pxor xmm5, xmm5 // constant 0
+
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1 // centre column difference is added twice (weight 2)
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0 // saturate to 8 bits
+ sub ecx, 8 // sets flags for jg; movq/lea below leave flags intact
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into ARGB, 16 pixels per loop.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked) __declspec(align(16))
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax // esi = src_sobely - src_sobelx
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+ // All loads and stores are movdqa: pointers must be 16 byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ sub ecx, 16 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [edx], xmm1
+ movdqa [edx + 16], xmm2
+ movdqa [edx + 32], xmm3
+ movdqa [edx + 48], xmm0
+ lea edx, [edx + 64]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y (saturating) and stores Sobel into a plane, 16 per loop.
+__declspec(naked) __declspec(align(16))
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_y
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax // esi = src_sobely - src_sobelx
+ // All loads and stores are movdqa: pointers must be 16 byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ sub ecx, 16 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB, 16 pixels per loop.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked) __declspec(align(16))
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi // callee saved; argument offsets below include this push
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax // esi = src_sobely - src_sobelx
+ pcmpeqb xmm5, xmm5 // alpha 255
+ // All loads and stores are movdqa: pointers must be 16 byte aligned.
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5
+ punpckhbw xmm0, xmm5
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ sub ecx, 16 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [edx], xmm6
+ movdqa [edx + 16], xmm4
+ movdqa [edx + 32], xmm7
+ movdqa [edx + 48], xmm1
+ lea edx, [edx + 64]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+// in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm5, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm5, xmm5 // area as float
+ rcpss xmm4, xmm5 // 1.0f / area (rcpss is an approximate reciprocal)
+ pshufd xmm4, xmm4, 0 // broadcast 1 / area to all 4 lanes
+ sub ecx, 4
+ jl l4b // fewer than 4 pixels: go straight to the 1 pixel loop
+
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4 // larger areas take the float path
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
+
+ // 4 pixel loop small blocks.
+ align 4
+ s4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5 // sum * (0.16 fixed point 1 / area) >> 16
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4
+
+ jmp l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl l1b // nothing left
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movdqa xmm0, [eax] // top left
+ psubd xmm0, [eax + edx * 4] // - top right
+ lea eax, [eax + 16]
+ psubd xmm0, [esi] // - bottom left
+ paddd xmm0, [esi + edx * 4] // + bottom right
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4 // Average = Sum * 1 / Area
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value. Each pixel yields 4 int32 sums.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ pxor xmm0, xmm0 // running per-channel sum for this row
+ pxor xmm1, xmm1 // constant 0 for zero extension
+
+ sub ecx, 4
+ jl l4b // fewer than 4 pixels: use the 1 pixel loop
+ test edx, 15
+ jne l4b // cumsum not 16 byte aligned: use the 1 pixel loop
+ // NOTE(review): previous_cumsum is read with movdqa but not alignment tested.
+ // 4 pixel loop
+ align 4
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1 // pixel 0 as 4 int32
+ punpckhwd xmm3, xmm1 // pixel 1 as 4 int32
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1 // pixel 2 as 4 int32
+ punpckhwd xmm5, xmm1 // pixel 3 as 4 int32
+
+ paddd xmm0, xmm2
+ movdqa xmm2, [esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqa xmm3, [esi + 16]
+ paddd xmm3, xmm0
+
+ paddd xmm0, xmm4
+ movdqa xmm4, [esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqa xmm5, [esi + 48]
+ lea esi, [esi + 64]
+ paddd xmm5, xmm0
+
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ movdqa [edx + 32], xmm4
+ movdqa [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl l1b // nothing left
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [esi] // previous row above (unaligned load)
+ lea esi, [esi + 16]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+__declspec(naked) __declspec(align(16))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ __asm {
+ push esi
+ push edi // two pushes: first argument is at esp + 12
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4 // esi = words (4, stride), the pmaddwd multipliers
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b // fewer than 4 pixels: use the 1 pixel loop
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ align 4
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ sub ecx, 4 // sets flags for jge; movq/lea below leave flags intact
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1 // ecx = remaining pixels - 1
+ jl l1b // nothing left
+
+ // 1 pixel loop
+ align 4
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter of 2 rows into 1 row; this AVX2 version does 32 bytes per loop.
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ shr eax, 1 // fraction reduced to 0..127
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ vmovd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ vmovd xmm5, eax // low fraction 128..1
+ vpunpcklbw xmm5, xmm5, xmm0 // byte pair (low fraction, high fraction)
+ vpunpcklwd xmm5, xmm5, xmm5
+ vpxor ymm0, ymm0, ymm0 // all-zero indices for vpermd
+ vpermd ymm5, ymm0, ymm5 // broadcast dword 0 to all 8 dwords
+
+ align 4
+ xloop:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm2, [esi + edx]
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2 // mutates
+ vpmaddubsw ymm0, ymm0, ymm5 // row0 * low + row1 * high
+ vpmaddubsw ymm1, ymm1, ymm5
+ vpsrlw ymm0, ymm0, 7 // / 128
+ vpsrlw ymm1, ymm1, 7
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ sub ecx, 32 // sets flags for jg; vmovdqu/lea below leave flags intact
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx] // avg(row0, row1)
+ vpavgb ymm0, ymm0, [esi + edx] // avg of that with row1 again
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, [esi] // avg(row1, row0)
+ vpavgb ymm0, ymm0, [esi] // avg of that with row0 again
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ rep movsb // edi is still dst_ptr here; copies ecx bytes from esi
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper // clear upper ymm halves before returning to SSE code
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1. Uses movdqa: pointers must be 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
+ shr eax, 1 // fraction reduced to 0..127
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0 // byte pair (low fraction, high fraction)
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0 // broadcast the pair to the whole register
+
+ align 4
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // interleave row0 and row1 bytes
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5 // row0 * low + row1 * high
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7 // / 128
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 16 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row0, row1)
+ pavgb xmm0, xmm1 // avg of that with row1 again
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row1, row0)
+ pavgb xmm0, xmm1 // avg of that with row0 again
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ movdqa xmm0, [esi]
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1. Uses movdqa: pointers must be 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5 // fraction byte duplicated into a 16 bit word
+ psrlw xmm5, 1 // halve to a 15 bit value
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5 // fraction word broadcast to all 8 words
+ pxor xmm4, xmm4 // constant 0 for zero extension
+
+ align 4
+ xloop:
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 16 // sets flags for jg; movdqa/lea below leave flags intact
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row0, row1)
+ pavgb xmm0, xmm1 // avg of that with row1 again
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row1, row0)
+ pavgb xmm0, xmm1 // avg of that with row0 again
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ movdqa xmm0, [esi]
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+// Bilinear filter 16x2 -> 16x1. Uses movdqu: pointers may be unaligned.
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
+ shr eax, 1 // fraction reduced to 0..127
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0 // byte pair (low fraction, high fraction)
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0 // broadcast the pair to the whole register
+
+ align 4
+ xloop:
+ movdqu xmm0, [esi]
+ movdqu xmm2, [esi + edx]
+ movdqu xmm1, xmm0
+ punpcklbw xmm0, xmm2 // interleave row0 and row1 bytes
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5 // row0 * low + row1 * high
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7 // / 128
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 16 // sets flags for jg; movdqu/lea below leave flags intact
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row0, row1)
+ pavgb xmm0, xmm1 // avg of that with row1 again
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row1, row0)
+ pavgb xmm0, xmm1 // avg of that with row0 again
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ movdqu xmm0, [esi]
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1. Uses movdqu: pointers may be unaligned.
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi // two pushes: argument offsets below start at esp + 8
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi // edi = dst - src, so [esi + edi] addresses dst
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5 // fraction byte duplicated into a 16 bit word
+ psrlw xmm5, 1 // halve to a 15 bit value
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5 // fraction word broadcast to all 8 words
+ pxor xmm4, xmm4 // constant 0 for zero extension
+
+ align 4
+ xloop:
+ movdqu xmm0, [esi] // row0
+ movdqu xmm2, [esi + edx] // row1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 16 // sets flags for jg; movdqu/lea below leave flags intact
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row0, row1)
+ pavgb xmm0, xmm1 // avg of that with row1 again
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1 // avg(row1, row0)
+ pavgb xmm0, xmm1 // avg of that with row0 again
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ movdqu xmm0, [esi]
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+__declspec(naked) __declspec(align(16)) // Averages each byte of row src_uv with the row src_uv_stride bytes after it, 16 bytes per iteration.
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax // edi = dst_uv - src_uv, so [eax + edi] addresses the destination
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // aligned 16 byte load of the first row
+ pavgb xmm0, [eax + edx] // rounded byte average with the second row
+ sub ecx, 16 // sets the flags tested by jg below
+ movdqa [eax + edi], xmm0 // aligned 16 byte store
+ lea eax, [eax + 16] // lea advances the pointer without touching flags
+ jg convertloop
+ pop edi
+ ret
+ }
+}
+
+#ifdef HAS_HALFROW_AVX2
+__declspec(naked) __declspec(align(16)) // AVX2 variant: averages two rows, 32 bytes per iteration, with unaligned accesses.
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax // edi = dst_uv - src_uv, so [eax + edi] addresses the destination
+
+ align 4
+ convertloop:
+ vmovdqu ymm0, [eax] // unaligned 32 byte load of the first row
+ vpavgb ymm0, ymm0, [eax + edx] // rounded byte average with the second row
+ sub ecx, 32 // sets the flags tested by jg below
+ vmovdqu [eax + edi], ymm0
+ lea eax, [eax + 32] // lea advances the pointer without touching flags
+ jg convertloop
+
+ pop edi
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_HALFROW_AVX2
+
+__declspec(naked) __declspec(align(16)) // Picks one byte per pixel via a 4 byte pshufb selector: 8 ARGB pixels in, 8 bytes out per iteration.
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_bayer
+ movd xmm5, [esp + 12] // selector
+ mov ecx, [esp + 16] // pix
+ pshufd xmm5, xmm5, 0 // replicate the 4 selector bytes into every dword
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // aligned load of pixels 0..3
+ movdqa xmm1, [eax + 16] // aligned load of pixels 4..7
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5 // each dword now holds the 4 selected bytes
+ pshufb xmm1, xmm5
+ punpckldq xmm0, xmm1 // low qword = 4 bytes from xmm0 then 4 bytes from xmm1
+ sub ecx, 8 // sets the flags tested by jg below
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8] // lea advances the pointer without touching flags
+ jg wloop
+ ret
+ }
+}
+
+// Specialized ARGB to Bayer that just isolates G channel. The selector argument is not read.
+__declspec(naked) __declspec(align(16))
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_bayer
+ // selector (at [esp + 12]) is intentionally not loaded
+ mov ecx, [esp + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
+ psrld xmm5, 24
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // aligned load of pixels 0..3
+ movdqa xmm1, [eax + 16] // aligned load of pixels 4..7
+ lea eax, [eax + 32]
+ psrld xmm0, 8 // Move green to bottom.
+ psrld xmm1, 8
+ pand xmm0, xmm5 // keep only the low byte of each dword
+ pand xmm1, xmm5
+ packssdw xmm0, xmm1 // 8 dwords -> 8 words
+ packuswb xmm0, xmm1 // low 8 bytes = the 8 greens; the high half is not stored
+ sub ecx, 8 // sets the flags tested by jg below
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8] // lea advances the pointer without touching flags
+ jg wloop
+ ret
+ }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. Aligned version: 8 pixels per iteration.
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqa xmm5, [ecx] // aligned load of the 16 byte pshufb control
+ mov ecx, [esp + 16] // pix
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // aligned load of pixels 0..3
+ movdqa xmm1, [eax + 16] // aligned load of pixels 4..7
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5 // reorder bytes per the shuffler
+ pshufb xmm1, xmm5
+ sub ecx, 8 // sets the flags tested by jg below
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32] // lea advances the pointer without touching flags
+ jg wloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // Same byte shuffle as the aligned SSSE3 version, but pixel loads/stores use movdqu.
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqa xmm5, [ecx] // the shuffler itself is still loaded with an aligned move
+ mov ecx, [esp + 16] // pix
+
+ align 4
+ wloop:
+ movdqu xmm0, [eax] // unaligned load of pixels 0..3
+ movdqu xmm1, [eax + 16] // unaligned load of pixels 4..7
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5 // reorder bytes per the shuffler
+ pshufb xmm1, xmm5
+ sub ecx, 8 // sets the flags tested by jg below
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32] // lea advances the pointer without touching flags
+ jg wloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) __declspec(align(16)) // AVX2 byte shuffle of ARGB pixels: 16 pixels (64 bytes) per iteration.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // pix
+
+ align 4
+ wloop:
+ vmovdqu ymm0, [eax] // unaligned load of pixels 0..7
+ vmovdqu ymm1, [eax + 32] // unaligned load of pixels 8..15
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5 // reorder bytes within each 128 bit lane
+ vpshufb ymm1, ymm1, ymm5
+ sub ecx, 16 // sets the flags tested by jg below
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64] // lea advances the pointer without touching flags
+ jg wloop
+
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked) __declspec(align(16)) // SSE2 shuffle: four known orders use pshufhw/pshuflw (4 pixels per iteration); any other order uses a byte loop (1 pixel per iteration).
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ push ebx
+ push esi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // pix
+ pxor xmm5, xmm5 // xmm5 = 0, used to zero extend bytes to words
+
+ mov ebx, [esi] // shuffler: first 4 bytes read as one little endian dword
+ cmp ebx, 0x03000102
+ je shuf_3012
+ cmp ebx, 0x00010203
+ je shuf_0123
+ cmp ebx, 0x00030201
+ je shuf_0321
+ cmp ebx, 0x02010003
+ je shuf_2103
+
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:
+ movzx ebx, byte ptr [esi] // shuffler[0] = source byte offset for output byte 0
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx], bl
+ movzx ebx, byte ptr [esi + 1]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 1], bl
+ movzx ebx, byte ptr [esi + 2]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 2], bl
+ movzx ebx, byte ptr [esi + 3]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 3], bl
+ lea eax, [eax + 4]
+ lea edx, [edx + 4]
+ sub ecx, 1 // one pixel per iteration in the generic path
+ jg shuf_any1
+ jmp shuf99
+
+ align 4
+ shuf_0123:
+ movdqu xmm0, [eax] // unaligned load of 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5 // pixels 0..1 as words
+ punpckhbw xmm1, xmm5 // pixels 2..3 as words
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshuflw xmm0, xmm0, 01Bh
+ pshufhw xmm1, xmm1, 01Bh
+ pshuflw xmm1, xmm1, 01Bh
+ packuswb xmm0, xmm1 // words back to 16 bytes
+ sub ecx, 4 // sets the flags tested by jg below
+ movdqu [edx], xmm0
+ lea edx, [edx + 16] // lea advances the pointer without touching flags
+ jg shuf_0123
+ jmp shuf99
+
+ align 4
+ shuf_0321:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshuflw xmm0, xmm0, 039h
+ pshufhw xmm1, xmm1, 039h
+ pshuflw xmm1, xmm1, 039h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_0321
+ jmp shuf99
+
+ align 4
+ shuf_2103:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshuflw xmm0, xmm0, 093h
+ pshufhw xmm1, xmm1, 093h
+ pshuflw xmm1, xmm1, 093h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_2103
+ jmp shuf99
+
+ align 4
+ shuf_3012:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshuflw xmm0, xmm0, 0C6h
+ pshufhw xmm1, xmm1, 0C6h
+ pshuflw xmm1, xmm1, 0C6h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_3012 // falls through to the common exit when done
+
+ shuf99:
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked) __declspec(align(16)) // Interleaves 16 Y, 8 U and 8 V bytes into 32 bytes of YUYV per iteration.
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi // edx = src_v - src_u, so [esi + edx] addresses V
+
+ align 4
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16 // 16 pixels per iteration; sets the flags tested by jg
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16)) // Interleaves 16 Y, 8 U and 8 V bytes into 32 bytes of UYVY per iteration.
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi // edx = src_v - src_u, so [esi + edx] addresses V
+
+ align 4
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16 // 16 pixels per iteration; sets the flags tested by jg
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked) __declspec(align(16)) // Per channel cubic: C0 + C1*X + C2*X*X + C3*X*X*X, with 4 floats per coefficient read from poly.
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ align 4
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3 // 8 bytes -> 8 words
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X (memory operand: poly must be 16 byte aligned)
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0 // truncate floats to 32 bit ints
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4 // first pack: each int's two 16 bit halves -> two bytes
+ packuswb xmm0, xmm0 // second pack: low qword = 8 result bytes (2 pixels)
+ sub ecx, 2 // sets the flags tested by jg below
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8] // lea advances the pointer without touching flags
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked) __declspec(align(16)) // AVX2/FMA variant of the per channel cubic polynomial, 2 pixels per iteration.
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ align 4
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0 // truncate floats to 32 bit ints
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ sub ecx, 2 // sets the flags tested by jg below
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8] // lea advances the pointer without touching flags
+ jg convertloop
+ vzeroupper // clear upper ymm halves before returning to non-AVX code
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table, in place: each of the 4 channel bytes is replaced by table_argb[value * 4 + channel].
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ align 4
+ convertloop:
+ movzx edx, byte ptr [eax] // channel 0 value
+ lea eax, [eax + 4] // advance early; the pixel is addressed as [eax - 4] below
+ movzx edx, byte ptr [esi + edx * 4] // table lookup for channel 0
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx // one pixel per iteration
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table, in place: the first 3 bytes of each 4 byte pixel are looked up; the 4th byte is left unchanged.
+__declspec(naked) __declspec(align(16))
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ align 4
+ convertloop:
+ movzx edx, byte ptr [eax] // channel 0 value
+ lea eax, [eax + 4] // advance early; the pixel is addressed as [eax - 4] below
+ movzx edx, byte ptr [esi + edx * 4] // table lookup for channel 0
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx // one pixel per iteration
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table: a per pixel weighted sum selects a table row at luma + (sum & 0xff00); alpha is copied unchanged.
+__declspec(naked) __declspec(align(16))
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0 // broadcast the table pointer to all 4 dwords
+ pshufd xmm3, xmm3, 0 // broadcast the 4 coefficient bytes to all 4 dwords
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5 // xmm5 = 0, used to zero extend words to dwords
+
+ // 4 pixel loop.
+ align 4
+ convertloop:
+ movdqu xmm0, qword ptr [eax] // generate luma ptr. NOTE(review): qword ptr on a 16 byte movdqu looks odd - confirm intent
+ pmaddubsw xmm0, xmm3 // multiply pixel bytes by coefficient bytes, add adjacent pairs
+ phaddw xmm0, xmm0 // add the two pair sums: one word per pixel
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5 // zero extend the 4 per pixel words to dwords
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0 // esi = table row pointer for pixel 0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0 // esi = table row pointer for pixel 1
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0 // esi = table row pointer for pixel 2
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0 // esi = table row pointer for pixel 3
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ sub ecx, 4 // sets the flags tested by jg below
+ lea eax, [eax + 16] // lea advances the pointers without touching flags
+ lea edi, [edi + 16]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c
deleted file mode 100644
index 1809300..0000000
--- a/third_party/libyuv/source/scale.c
+++ /dev/null
@@ -1,3884 +0,0 @@
-/*
- * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/libyuv/include/libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/source/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-/*
- * Note: Defining YUV_DISABLE_ASM allows to use c version.
- */
-//#define YUV_DISABLE_ASM
-
-#if defined(_MSC_VER)
-#define ALIGN16(var) __declspec(align(16)) var
-#else
-#define ALIGN16(var) var __attribute__((aligned(16)))
-#endif
-
-// Note: A Neon reference manual
-// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
-// Note: Some SSE2 reference manuals
-// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-
-static int use_reference_impl_ = 0;
-
-void SetUseReferenceImpl(int use) {
- use_reference_impl_ = use;
-}
-
-// ScaleRowDown2Int also used by planar functions
-
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
-
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- "1: \n"
- "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
- "vst1.u8 {q0}, [%1]! \n" // store even pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
- );
-}
-
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- "add %1, %0 \n" // change the stride to row 2 pointer
- "1: \n"
- "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
- "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vst1.u8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n" // 16 processed per loop
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-#define HAS_SCALEROWDOWN4_NEON
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "1: \n"
- "vld2.u8 {d0, d1}, [%0]! \n"
- "vtrn.u8 d1, d0 \n"
- "vshrn.u16 d0, q0, #8 \n"
- "vst1.u32 {d0[1]}, [%1]! \n"
-
- "subs %2, #4 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc"
- );
-}
-
-static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "add r4, %0, %3 \n"
- "add r5, r4, %3 \n"
- "add %3, r5, %3 \n"
- "1: \n"
- "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data
- "vld1.u8 {q1}, [r4]! \n"
- "vld1.u8 {q2}, [r5]! \n"
- "vld1.u8 {q3}, [%3]! \n"
-
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
-
- "vpaddl.u16 q0, q0 \n"
-
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
-
- "vmovn.u16 d0, q0 \n"
- "vst1.u32 {d0[0]}, [%1]! \n"
-
- "subs %2, #4 \n"
- "bhi 1b \n"
-
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
- );
-}
-
-#define HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vmov d2, d3 \n" // order needs to be d0, d1, d2
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
- "subs %2, #24 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc"
- );
-}
-
-static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
-
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
-
- // 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
-
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
-
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
-
- "subs %2, #24 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
- );
-}
-
-static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
-
- // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
-
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
-
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
-
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
-
- "vst3.u8 {d0, d1, d2}, [%1]! \n"
-
- "subs %2, #24 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
- );
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-const uint8 shuf38[16] __attribute__ ((aligned(16))) =
- { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
- { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
- { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
- { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vld1.u8 {q3}, [%3] \n"
- "1: \n"
- "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.u8 {d4}, [%1]! \n"
- "vst1.u32 {d5[0]}, [%1]! \n"
- "subs %2, #12 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(shuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
- );
-}
-
-// 32x3 -> 12x1
-static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vld1.u16 {q13}, [%4] \n"
- "vld1.u8 {q14}, [%5] \n"
- "vld1.u8 {q15}, [%6] \n"
- "add r4, %0, %3, lsl #1 \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
-
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q15 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.u8 {d3}, [%1]! \n"
- "vst1.u32 {d4[0]}, [%1]! \n"
- "subs %2, #12 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(mult38_div6), // %4
- "r"(shuf38_2), // %5
- "r"(mult38_div9) // %6
- : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
- "q13", "q14", "q15", "memory", "cc"
- );
-}
-
-// 32x2 -> 12x1
-static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "vld1.u16 {q13}, [%4] \n"
- "vld1.u8 {q14}, [%5] \n"
- "add %3, %0 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
- "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
-
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
-
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
-
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
-
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
-
- // combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
-
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
-
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
-
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q13 \n"
-
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
-
- "vst1.u8 {d3}, [%1]! \n"
- "vst1.u32 {d4[0]}, [%1]! \n"
- "subs %2, #12 \n"
- "bhi 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(mult38_div6), // %4
- "r"(shuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
- );
-}
-
-/**
- * SSE2 downscalers with interpolation.
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-
-// Constants for SSE2 code
-#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
- !defined(YUV_DISABLE_ASM)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#else
-#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
-#endif
-
-#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
- defined(__i386__)
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".globl _" #name " \n" \
-"_" #name ": \n"
-#else
-#define DECLARE_FUNCTION(name) \
- ".text \n" \
- ".global " #name " \n" \
-#name ": \n"
-#endif
-
-
-// Offsets for source bytes 0 to 9
-//extern "C"
-TALIGN16(const uint8, shuf0[16]) =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-//extern "C"
-TALIGN16(const uint8, shuf1[16]) =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-//extern "C"
-TALIGN16(const uint8, shuf2[16]) =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-//extern "C"
-TALIGN16(const uint8, shuf01[16]) =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-//extern "C"
-TALIGN16(const uint8, shuf11[16]) =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-//extern "C"
-TALIGN16(const uint8, shuf21[16]) =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-//extern "C"
-TALIGN16(const uint8, madd01[16]) =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-//extern "C"
-TALIGN16(const uint8, madd11[16]) =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-//extern "C"
-TALIGN16(const uint8, madd21[16]) =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-//extern "C"
-TALIGN16(const int16, round34[8]) =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-//extern "C"
-TALIGN16(const uint8, shuf38a[16]) =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-//extern "C"
-TALIGN16(const uint8, shuf38b[16]) =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-//extern "C"
-TALIGN16(const uint8, shufac0[16]) =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-//extern "C"
-TALIGN16(const uint8, shufac3[16]) =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-//extern "C"
-TALIGN16(const uint16, scaleac3[8]) =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab0[16]) =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab1[16]) =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab2[16]) =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-//extern "C"
-TALIGN16(const uint16, scaleab2[8]) =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-#endif
-
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
-
-#define HAS_SCALEROWDOWN2_SSE2
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
-
- ret
- }
-}
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 16
- ja wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- // src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
- psrld xmm5, 24
-
- wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 8
- ja wloop
-
- popad
- ret
- }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
- lea edx, [ebx + ebx * 2] // src_stride * 3
-
- wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea esi, [esi + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
- packuswb xmm0, xmm0
-
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 8
- ja wloop
-
- popad
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-// Point samples 32 pixels to 4 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- // src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
- psrlq xmm5, 56
-
- wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1 // 32->16
- packuswb xmm0, xmm0 // 16->8
- packuswb xmm0, xmm0 // 8->4
- movd dword ptr [edi], xmm0
- lea edi, [edi + 4]
- sub ecx, 4
- ja wloop
-
- popad
- ret
- }
-}
-
-// Blends 32x8 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- lea edx, [ebx + ebx * 2] // src_stride * 3
- pxor xmm7, xmm7
-
- wloop:
- movdqa xmm0, [esi] // average 8 rows to 1
- movdqa xmm1, [esi + 16]
- movdqa xmm2, [esi + ebx]
- movdqa xmm3, [esi + ebx + 16]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- movdqa xmm2, [esi + ebx * 2]
- movdqa xmm3, [esi + ebx * 2 + 16]
- movdqa xmm4, [esi + edx]
- movdqa xmm5, [esi + edx + 16]
- lea ebp, [esi + ebx * 4]
- lea esi, [esi + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, [ebp]
- movdqa xmm3, [ebp + 16]
- movdqa xmm4, [ebp + ebx]
- movdqa xmm5, [ebp + ebx + 16]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- movdqa xmm4, [ebp + ebx * 2]
- movdqa xmm5, [ebp + ebx * 2 + 16]
- movdqa xmm6, [ebp + edx]
- pavgb xmm4, xmm6
- movdqa xmm6, [ebp + edx + 16]
- pavgb xmm5, xmm6
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- psadbw xmm0, xmm7 // average 32 pixels to 4
- psadbw xmm1, xmm7
- pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
- pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
- por xmm0, xmm1 // -> 3201
- psrlw xmm0, 3
- packuswb xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword ptr [edi], xmm0
-
- lea edi, [edi + 4]
- sub ecx, 4
- ja wloop
-
- popad
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- // src_stride ignored
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm3, _shuf0
- movdqa xmm4, _shuf1
- movdqa xmm5, _shuf2
-
- wloop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + 16]
- lea esi, [esi + 32]
- movdqa xmm2, xmm1
- palignr xmm1, xmm0, 8
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edi], xmm0
- movq qword ptr [edi + 8], xmm1
- movq qword ptr [edi + 16], xmm2
- lea edi, [edi + 24]
- sub ecx, 24
- ja wloop
-
- popad
- ret
- }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 round34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
- wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, _madd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
- sub ecx, 24
- ja wloop
-
- popad
- ret
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov ebx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _round34
-
- wloop:
- movdqa xmm0, [esi] // pixels 0..7
- movdqa xmm1, [esi+ebx]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi], xmm0
- movdqu xmm0, [esi+8] // pixels 8..15
- movdqu xmm1, [esi+ebx+8]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi+8], xmm0
- movdqa xmm0, [esi+16] // pixels 16..23
- movdqa xmm1, [esi+ebx+16]
- lea esi, [esi+32]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, _madd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edi+16], xmm0
- lea edi, [edi+24]
- sub ecx, 24
- ja wloop
-
- popad
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked)
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shuf38a
- movdqa xmm5, _shuf38b
-
- xloop:
- movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
- lea esi, [esi + 32]
- pshufb xmm0, xmm4
- pshufb xmm1, xmm5
- paddusb xmm0, xmm1
-
- movq qword ptr [edi], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edi + 8], xmm1
- lea edi, [edi + 12]
- sub ecx, 12
- ja xloop
-
- popad
- ret
- }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufac0
- movdqa xmm5, _shufac3
- movdqa xmm6, _scaleac3
- pxor xmm7, xmm7
-
- xloop:
- movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
- movdqa xmm2, [esi + edx]
- movhlps xmm1, xmm0
- movhlps xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
- movdqa xmm2, [esi + edx * 2]
- lea esi, [esi + 16]
- movhlps xmm3, xmm2
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- paddusw xmm0, xmm2
- paddusw xmm1, xmm3
-
- movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
- psrldq xmm0, 2
- paddusw xmm2, xmm0
- psrldq xmm0, 2
- paddusw xmm2, xmm0
- pshufb xmm2, xmm4
-
- movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
- psrldq xmm1, 2
- paddusw xmm3, xmm1
- psrldq xmm1, 2
- paddusw xmm3, xmm1
- pshufb xmm3, xmm5
- paddusw xmm2, xmm3
-
- pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
- packuswb xmm2, xmm2
-
- movd [edi], xmm2 // write 6 pixels
- pextrw eax, xmm2, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
- sub ecx, 6
- ja xloop
-
- popad
- ret
- }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- movdqa xmm4, _shufab0
- movdqa xmm5, _shufab1
- movdqa xmm6, _shufab2
- movdqa xmm7, _scaleab2
-
- xloop:
- movdqa xmm2, [esi] // average 2 rows into xmm2
- pavgb xmm2, [esi + edx]
- lea esi, [esi + 16]
-
- movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
- pshufb xmm0, xmm4
- movdqa xmm1, xmm2
- pshufb xmm1, xmm5
- paddusw xmm0, xmm1
- pshufb xmm2, xmm6
- paddusw xmm0, xmm2
-
- pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
- packuswb xmm0, xmm0
-
- movd [edi], xmm0 // write 6 pixels
- pextrw eax, xmm0, 2
- mov [edi + 4], ax
- lea edi, [edi + 6]
- sub ecx, 6
- ja xloop
-
- popad
- ret
- }
-}
-
-#define HAS_SCALEADDROWS_SSE2
-
-// Reads 8xN bytes and produces 16 shorts at a time.
-__declspec(naked)
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- __asm {
- pushad
- mov esi, [esp + 32 + 4] // src_ptr
- mov edx, [esp + 32 + 8] // src_stride
- mov edi, [esp + 32 + 12] // dst_ptr
- mov ecx, [esp + 32 + 16] // dst_width
- mov ebx, [esp + 32 + 20] // height
- pxor xmm5, xmm5
- dec ebx
-
- xloop:
- // first row
- movdqa xmm2, [esi]
- lea eax, [esi + edx]
- movhlps xmm3, xmm2
- mov ebp, ebx
- punpcklbw xmm2, xmm5
- punpcklbw xmm3, xmm5
-
- // sum remaining rows
- yloop:
- movdqa xmm0, [eax] // read 16 pixels
- lea eax, [eax + edx] // advance to next row
- movhlps xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- paddusw xmm2, xmm0 // sum 16 words
- paddusw xmm3, xmm1
- sub ebp, 1
- ja yloop
-
- movdqa [edi], xmm2
- movdqa [edi + 16], xmm3
- lea edi, [edi + 32]
- lea esi, [esi + 16]
-
- sub ecx, 16
- ja xloop
-
- popad
- ret
- }
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-#define HAS_SCALEFILTERROWS_SSE2
-__declspec(naked)
-static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- cmp eax, 0
- je xloop1
- cmp eax, 128
- je xloop2
-
- movd xmm6, eax // xmm6 = y fraction
- punpcklwd xmm6, xmm6
- pshufd xmm6, xmm6, 0
- neg eax // xmm5 = 256 - y fraction
- add eax, 256
- movd xmm5, eax
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
- pxor xmm7, xmm7
-
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm0, xmm7
- punpcklbw xmm2, xmm7
- punpckhbw xmm1, xmm7
- punpckhbw xmm3, xmm7
- pmullw xmm0, xmm5 // scale row 0
- pmullw xmm1, xmm5
- pmullw xmm2, xmm6 // scale row 1
- pmullw xmm3, xmm6
- paddusw xmm0, xmm2 // sum rows
- paddusw xmm1, xmm3
- psrlw xmm0, 8
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
-
- xloop1:
- movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop1
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
-
- xloop2:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop2
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
- }
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
-#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked)
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- shr eax, 1
- cmp eax, 0
- je xloop1
- cmp eax, 64
- je xloop2
-
- mov ah,al
- neg al
- add al, 128
- movd xmm5, eax
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
-
- xloop1:
- movdqa xmm0, [esi]
- lea esi, [esi + 16]
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop1
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
-
- xloop2:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- lea esi, [esi + 16]
- pavgb xmm0, xmm2
- movdqa [edi], xmm0
- lea edi, [edi + 16]
- sub ecx, 16
- ja xloop2
-
- mov al, [edi - 1]
- mov [edi], al
- pop edi
- pop esi
- ret
-
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width) {
- __asm {
- mov edx, [esp + 4] // dst_ptr
- mov eax, [esp + 8] // src_ptr
- mov ecx, [esp + 12] // dst_width
- movdqa xmm1, _round34
- movdqa xmm2, _shuf01
- movdqa xmm3, _shuf11
- movdqa xmm4, _shuf21
- movdqa xmm5, _madd01
- movdqa xmm6, _madd11
- movdqa xmm7, _madd21
-
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax+8] // pixels 8..15
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+8], xmm0
- movdqa xmm0, [eax+16] // pixels 16..23
- lea eax, [eax+32]
- pshufb xmm0, xmm4
- pmaddubsw xmm0, xmm7
- paddsw xmm0, xmm1
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx+16], xmm0
- lea edx, [edx+24]
- sub ecx, 24
- ja wloop
- ret
- }
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-);
-}
-
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-);
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-);
-}
-
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t temp = 0;
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0x8,%%xmm7 \n"
- "lea (%4,%4,2),%3 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa (%0,%4,2),%%xmm2 \n"
- "movdqa 0x10(%0,%4,2),%%xmm3 \n"
- "movdqa (%0,%3,1),%%xmm4 \n"
- "movdqa 0x10(%0,%3,1),%%xmm5 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pand %%xmm7,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(temp) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__x86_64__)
- , "xmm6", "xmm7"
-#endif
-);
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlq $0x38,%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%1) \n"
- "lea 0x4(%1),%1 \n"
- "sub $0x4,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-);
-}
-
-#if defined(__i386__)
-void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%ebx \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "lea (%ebx,%ebx,2),%edx \n"
- "pxor %xmm7,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa 0x10(%esi),%xmm1 \n"
- "movdqa (%esi,%ebx,1),%xmm2 \n"
- "movdqa 0x10(%esi,%ebx,1),%xmm3 \n"
- "pavgb %xmm2,%xmm0 \n"
- "pavgb %xmm3,%xmm1 \n"
- "movdqa (%esi,%ebx,2),%xmm2 \n"
- "movdqa 0x10(%esi,%ebx,2),%xmm3 \n"
- "movdqa (%esi,%edx,1),%xmm4 \n"
- "movdqa 0x10(%esi,%edx,1),%xmm5 \n"
- "lea (%esi,%ebx,4),%ebp \n"
- "lea 0x20(%esi),%esi \n"
- "pavgb %xmm4,%xmm2 \n"
- "pavgb %xmm5,%xmm3 \n"
- "pavgb %xmm2,%xmm0 \n"
- "pavgb %xmm3,%xmm1 \n"
- "movdqa 0x0(%ebp),%xmm2 \n"
- "movdqa 0x10(%ebp),%xmm3 \n"
- "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n"
- "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n"
- "pavgb %xmm4,%xmm2 \n"
- "pavgb %xmm5,%xmm3 \n"
- "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n"
- "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n"
- "movdqa 0x0(%ebp,%edx,1),%xmm6 \n"
- "pavgb %xmm6,%xmm4 \n"
- "movdqa 0x10(%ebp,%edx,1),%xmm6 \n"
- "pavgb %xmm6,%xmm5 \n"
- "pavgb %xmm4,%xmm2 \n"
- "pavgb %xmm5,%xmm3 \n"
- "pavgb %xmm2,%xmm0 \n"
- "pavgb %xmm3,%xmm1 \n"
- "psadbw %xmm7,%xmm0 \n"
- "psadbw %xmm7,%xmm1 \n"
- "pshufd $0xd8,%xmm0,%xmm0 \n"
- "pshufd $0x8d,%xmm1,%xmm1 \n"
- "por %xmm1,%xmm0 \n"
- "psrlw $0x3,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movd %xmm0,(%edi) \n"
- "lea 0x4(%edi),%edi \n"
- "sub $0x4,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-// fpic is used for magiccam plugin
-#if !defined(__PIC__)
-#define HAS_SCALEROWDOWN34_SSSE3
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shuf0,%xmm3 \n"
- "movdqa _shuf1,%xmm4 \n"
- "movdqa _shuf2,%xmm5 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa 0x10(%esi),%xmm2 \n"
- "lea 0x20(%esi),%esi \n"
- "movdqa %xmm2,%xmm1 \n"
- "palignr $0x8,%xmm0,%xmm1 \n"
- "pshufb %xmm3,%xmm0 \n"
- "pshufb %xmm4,%xmm1 \n"
- "pshufb %xmm5,%xmm2 \n"
- "movq %xmm0,(%edi) \n"
- "movq %xmm1,0x8(%edi) \n"
- "movq %xmm2,0x10(%edi) \n"
- "lea 0x18(%edi),%edi \n"
- "sub $0x18,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%ebp \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shuf01,%xmm2 \n"
- "movdqa _shuf11,%xmm3 \n"
- "movdqa _shuf21,%xmm4 \n"
- "movdqa _madd01,%xmm5 \n"
- "movdqa _madd11,%xmm6 \n"
- "movdqa _round34,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%ebp),%xmm1 \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm2,%xmm0 \n"
- "pmaddubsw %xmm5,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,(%edi) \n"
- "movdqu 0x8(%esi),%xmm0 \n"
- "movdqu 0x8(%esi,%ebp),%xmm1 \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm3,%xmm0 \n"
- "pmaddubsw %xmm6,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,0x8(%edi) \n"
- "movdqa 0x10(%esi),%xmm0 \n"
- "movdqa 0x10(%esi,%ebp),%xmm1 \n"
- "lea 0x20(%esi),%esi \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm4,%xmm0 \n"
- "movdqa _madd21,%xmm1 \n"
- "pmaddubsw %xmm1,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,0x10(%edi) \n"
- "lea 0x18(%edi),%edi \n"
- "sub $0x18,%ecx \n"
- "ja 1b \n"
-
- "popa \n"
- "ret \n"
-);
-
-void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%ebp \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shuf01,%xmm2 \n"
- "movdqa _shuf11,%xmm3 \n"
- "movdqa _shuf21,%xmm4 \n"
- "movdqa _madd01,%xmm5 \n"
- "movdqa _madd11,%xmm6 \n"
- "movdqa _round34,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%ebp,1),%xmm1 \n"
- "pavgb %xmm0,%xmm1 \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm2,%xmm0 \n"
- "pmaddubsw %xmm5,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,(%edi) \n"
- "movdqu 0x8(%esi),%xmm0 \n"
- "movdqu 0x8(%esi,%ebp,1),%xmm1 \n"
- "pavgb %xmm0,%xmm1 \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm3,%xmm0 \n"
- "pmaddubsw %xmm6,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,0x8(%edi) \n"
- "movdqa 0x10(%esi),%xmm0 \n"
- "movdqa 0x10(%esi,%ebp,1),%xmm1 \n"
- "lea 0x20(%esi),%esi \n"
- "pavgb %xmm0,%xmm1 \n"
- "pavgb %xmm1,%xmm0 \n"
- "pshufb %xmm4,%xmm0 \n"
- "movdqa _madd21,%xmm1 \n"
- "pmaddubsw %xmm1,%xmm0 \n"
- "paddsw %xmm7,%xmm0 \n"
- "psrlw $0x2,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movq %xmm0,0x10(%edi) \n"
- "lea 0x18(%edi),%edi \n"
- "sub $0x18,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-#define HAS_SCALEROWDOWN38_SSSE3
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%edx \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shuf38a ,%xmm4 \n"
- "movdqa _shuf38b ,%xmm5 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa 0x10(%esi),%xmm1 \n"
- "lea 0x20(%esi),%esi \n"
- "pshufb %xmm4,%xmm0 \n"
- "pshufb %xmm5,%xmm1 \n"
- "paddusb %xmm1,%xmm0 \n"
- "movq %xmm0,(%edi) \n"
- "movhlps %xmm0,%xmm1 \n"
- "movd %xmm1,0x8(%edi) \n"
- "lea 0xc(%edi),%edi \n"
- "sub $0xc,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%edx \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shufac0,%xmm4 \n"
- "movdqa _shufac3,%xmm5 \n"
- "movdqa _scaleac3,%xmm6 \n"
- "pxor %xmm7,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%edx,1),%xmm2 \n"
- "movhlps %xmm0,%xmm1 \n"
- "movhlps %xmm2,%xmm3 \n"
- "punpcklbw %xmm7,%xmm0 \n"
- "punpcklbw %xmm7,%xmm1 \n"
- "punpcklbw %xmm7,%xmm2 \n"
- "punpcklbw %xmm7,%xmm3 \n"
- "paddusw %xmm2,%xmm0 \n"
- "paddusw %xmm3,%xmm1 \n"
- "movdqa (%esi,%edx,2),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "movhlps %xmm2,%xmm3 \n"
- "punpcklbw %xmm7,%xmm2 \n"
- "punpcklbw %xmm7,%xmm3 \n"
- "paddusw %xmm2,%xmm0 \n"
- "paddusw %xmm3,%xmm1 \n"
- "movdqa %xmm0,%xmm2 \n"
- "psrldq $0x2,%xmm0 \n"
- "paddusw %xmm0,%xmm2 \n"
- "psrldq $0x2,%xmm0 \n"
- "paddusw %xmm0,%xmm2 \n"
- "pshufb %xmm4,%xmm2 \n"
- "movdqa %xmm1,%xmm3 \n"
- "psrldq $0x2,%xmm1 \n"
- "paddusw %xmm1,%xmm3 \n"
- "psrldq $0x2,%xmm1 \n"
- "paddusw %xmm1,%xmm3 \n"
- "pshufb %xmm5,%xmm3 \n"
- "paddusw %xmm3,%xmm2 \n"
- "pmulhuw %xmm6,%xmm2 \n"
- "packuswb %xmm2,%xmm2 \n"
- "movd %xmm2,(%edi) \n"
- "pextrw $0x2,%xmm2,%eax \n"
- "mov %ax,0x4(%edi) \n"
- "lea 0x6(%edi),%edi \n"
- "sub $0x6,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- asm(
- DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%edx \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "movdqa _shufab0,%xmm4 \n"
- "movdqa _shufab1,%xmm5 \n"
- "movdqa _shufab2,%xmm6 \n"
- "movdqa _scaleab2,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm2 \n"
- "pavgb (%esi,%edx,1),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "movdqa %xmm2,%xmm0 \n"
- "pshufb %xmm4,%xmm0 \n"
- "movdqa %xmm2,%xmm1 \n"
- "pshufb %xmm5,%xmm1 \n"
- "paddusw %xmm1,%xmm0 \n"
- "pshufb %xmm6,%xmm2 \n"
- "paddusw %xmm2,%xmm0 \n"
- "pmulhuw %xmm7,%xmm0 \n"
- "packuswb %xmm0,%xmm0 \n"
- "movd %xmm0,(%edi) \n"
- "pextrw $0x2,%xmm0,%eax \n"
- "mov %ax,0x4(%edi) \n"
- "lea 0x6(%edi),%edi \n"
- "sub $0x6,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-#endif // __PIC__
-
-#define HAS_SCALEADDROWS_SSE2
-void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height);
- asm(
- DECLARE_FUNCTION(ScaleAddRows_SSE2)
- "pusha \n"
- "mov 0x24(%esp),%esi \n"
- "mov 0x28(%esp),%edx \n"
- "mov 0x2c(%esp),%edi \n"
- "mov 0x30(%esp),%ecx \n"
- "mov 0x34(%esp),%ebx \n"
- "pxor %xmm5,%xmm5 \n"
-
-"1:"
- "movdqa (%esi),%xmm2 \n"
- "lea (%esi,%edx,1),%eax \n"
- "movhlps %xmm2,%xmm3 \n"
- "lea -0x1(%ebx),%ebp \n"
- "punpcklbw %xmm5,%xmm2 \n"
- "punpcklbw %xmm5,%xmm3 \n"
-
-"2:"
- "movdqa (%eax),%xmm0 \n"
- "lea (%eax,%edx,1),%eax \n"
- "movhlps %xmm0,%xmm1 \n"
- "punpcklbw %xmm5,%xmm0 \n"
- "punpcklbw %xmm5,%xmm1 \n"
- "paddusw %xmm0,%xmm2 \n"
- "paddusw %xmm1,%xmm3 \n"
- "sub $0x1,%ebp \n"
- "ja 2b \n"
-
- "movdqa %xmm2,(%edi) \n"
- "movdqa %xmm3,0x10(%edi) \n"
- "lea 0x20(%edi),%edi \n"
- "lea 0x10(%esi),%esi \n"
- "sub $0x10,%ecx \n"
- "ja 1b \n"
- "popa \n"
- "ret \n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- DECLARE_FUNCTION(ScaleFilterRows_SSE2)
- "push %esi \n"
- "push %edi \n"
- "mov 0xc(%esp),%edi \n"
- "mov 0x10(%esp),%esi \n"
- "mov 0x14(%esp),%edx \n"
- "mov 0x18(%esp),%ecx \n"
- "mov 0x1c(%esp),%eax \n"
- "cmp $0x0,%eax \n"
- "je 2f \n"
- "cmp $0x80,%eax \n"
- "je 3f \n"
- "movd %eax,%xmm6 \n"
- "punpcklwd %xmm6,%xmm6 \n"
- "pshufd $0x0,%xmm6,%xmm6 \n"
- "neg %eax \n"
- "add $0x100,%eax \n"
- "movd %eax,%xmm5 \n"
- "punpcklwd %xmm5,%xmm5 \n"
- "pshufd $0x0,%xmm5,%xmm5 \n"
- "pxor %xmm7,%xmm7 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%edx,1),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "movdqa %xmm0,%xmm1 \n"
- "movdqa %xmm2,%xmm3 \n"
- "punpcklbw %xmm7,%xmm0 \n"
- "punpcklbw %xmm7,%xmm2 \n"
- "punpckhbw %xmm7,%xmm1 \n"
- "punpckhbw %xmm7,%xmm3 \n"
- "pmullw %xmm5,%xmm0 \n"
- "pmullw %xmm5,%xmm1 \n"
- "pmullw %xmm6,%xmm2 \n"
- "pmullw %xmm6,%xmm3 \n"
- "paddusw %xmm2,%xmm0 \n"
- "paddusw %xmm3,%xmm1 \n"
- "psrlw $0x8,%xmm0 \n"
- "psrlw $0x8,%xmm1 \n"
- "packuswb %xmm1,%xmm0 \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 1b \n"
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-
-"2:"
- "movdqa (%esi),%xmm0 \n"
- "lea 0x10(%esi),%esi \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 2b \n"
-
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-
-"3:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%edx,1),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "pavgb %xmm2,%xmm0 \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 3b \n"
-
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction);
- asm(
- DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
- "push %esi \n"
- "push %edi \n"
- "mov 0xc(%esp),%edi \n"
- "mov 0x10(%esp),%esi \n"
- "mov 0x14(%esp),%edx \n"
- "mov 0x18(%esp),%ecx \n"
- "mov 0x1c(%esp),%eax \n"
- "shr %eax \n"
- "cmp $0x0,%eax \n"
- "je 2f \n"
- "cmp $0x40,%eax \n"
- "je 3f \n"
- "mov %al,%ah \n"
- "neg %al \n"
- "add $0x80,%al \n"
- "movd %eax,%xmm5 \n"
- "punpcklwd %xmm5,%xmm5 \n"
- "pshufd $0x0,%xmm5,%xmm5 \n"
-
-"1:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%edx,1),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "movdqa %xmm0,%xmm1 \n"
- "punpcklbw %xmm2,%xmm0 \n"
- "punpckhbw %xmm2,%xmm1 \n"
- "pmaddubsw %xmm5,%xmm0 \n"
- "pmaddubsw %xmm5,%xmm1 \n"
- "psrlw $0x7,%xmm0 \n"
- "psrlw $0x7,%xmm1 \n"
- "packuswb %xmm1,%xmm0 \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 1b \n"
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-
-"2:"
- "movdqa (%esi),%xmm0 \n"
- "lea 0x10(%esi),%esi \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 2b \n"
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-
-"3:"
- "movdqa (%esi),%xmm0 \n"
- "movdqa (%esi,%edx,1),%xmm2 \n"
- "lea 0x10(%esi),%esi \n"
- "pavgb %xmm2,%xmm0 \n"
- "movdqa %xmm0,(%edi) \n"
- "lea 0x10(%edi),%edi \n"
- "sub $0x10,%ecx \n"
- "ja 3b \n"
- "mov -0x1(%edi),%al \n"
- "mov %al,(%edi) \n"
- "pop %edi \n"
- "pop %esi \n"
- "ret \n"
-);
-
-#elif defined(__x86_64__)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "lea (%3,%3,2),%%r10 \n"
- "pxor %%xmm7,%%xmm7 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa (%0,%3,2),%%xmm2 \n"
- "movdqa 0x10(%0,%3,2),%%xmm3 \n"
- "movdqa (%0,%%r10,1),%%xmm4 \n"
- "movdqa 0x10(%0,%%r10,1),%%xmm5 \n"
- "lea (%0,%3,4),%%r11 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa 0x0(%%r11),%%xmm2 \n"
- "movdqa 0x10(%%r11),%%xmm3 \n"
- "movdqa 0x0(%%r11,%3,1),%%xmm4 \n"
- "movdqa 0x10(%%r11,%3,1),%%xmm5 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "movdqa 0x0(%%r11,%3,2),%%xmm4 \n"
- "movdqa 0x10(%%r11,%3,2),%%xmm5 \n"
- "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n"
- "pavgb %%xmm6,%%xmm4 \n"
- "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n"
- "pavgb %%xmm6,%%xmm5 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psadbw %%xmm7,%%xmm0 \n"
- "psadbw %%xmm7,%%xmm1 \n"
- "pshufd $0xd8,%%xmm0,%%xmm0 \n"
- "pshufd $0x8d,%%xmm1,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "psrlw $0x3,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%1) \n"
- "lea 0x4(%1),%1 \n"
- "sub $0x4,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
-);
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%3),%%xmm3 \n"
- "movdqa (%4),%%xmm4 \n"
- "movdqa (%5),%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf0), // %3
- "r"(_shuf1), // %4
- "r"(_shuf2) // %5
- : "memory", "cc"
-);
-}
-
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%4),%%xmm2 \n" // _shuf01
- "movdqa (%5),%%xmm3 \n" // _shuf11
- "movdqa (%6),%%xmm4 \n" // _shuf21
- "movdqa (%7),%%xmm5 \n" // _madd01
- "movdqa (%8),%%xmm6 \n" // _madd11
- "movdqa (%9),%%xmm7 \n" // _round34
- "movdqa (%10),%%xmm8 \n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqu 0x8(%0),%%xmm0 \n"
- "movdqu 0x8(%0,%3),%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm6,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm0 \n"
- "movdqa 0x10(%0,%3),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm8,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "cc", "xmm6", "xmm7", "xmm8"
-);
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%4),%%xmm2 \n" // _shuf01
- "movdqa (%5),%%xmm3 \n" // _shuf11
- "movdqa (%6),%%xmm4 \n" // _shuf21
- "movdqa (%7),%%xmm5 \n" // _madd01
- "movdqa (%8),%%xmm6 \n" // _madd11
- "movdqa (%9),%%xmm7 \n" // _round34
- "movdqa (%10),%%xmm8 \n" // _madd21
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3,1),%%xmm1 \n"
- "pavgb %%xmm0,%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movdqu 0x8(%0),%%xmm0 \n"
- "movdqu 0x8(%0,%3,1),%%xmm1 \n"
- "pavgb %%xmm0,%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm6,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm0 \n"
- "movdqa 0x10(%0,%3,1),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm0,%%xmm1 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm8,%%xmm0 \n"
- "paddsw %%xmm7,%%xmm0 \n"
- "psrlw $0x2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"(_shuf01), // %4
- "r"(_shuf11), // %5
- "r"(_shuf21), // %6
- "r"(_madd01), // %7
- "r"(_madd11), // %8
- "r"(_round34), // %9
- "r"(_madd21) // %10
- : "memory", "cc", "xmm6", "xmm7", "xmm8"
-);
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%3),%%xmm4 \n"
- "movdqa (%4),%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(_shuf38a), // %3
- "r"(_shuf38b) // %4
- : "memory", "cc"
-);
-}
-
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%4),%%xmm4 \n"
- "movdqa (%5),%%xmm5 \n"
- "movdqa (%6),%%xmm6 \n"
- "pxor %%xmm7,%%xmm7 \n"
-"1:"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm7,%%xmm0 \n"
- "punpcklbw %%xmm7,%%xmm1 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqa (%0,%3,2),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm2 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm3 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm3 \n"
- "pshufb %%xmm5,%%xmm3 \n"
- "paddusw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,(%1) \n"
- "pextrw $0x2,%%xmm2,%%eax \n"
- "mov %%ax,0x4(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"(_shufac0), // %4
- "r"(_shufac3), // %5
- "r"(_scaleac3) // %6
- : "memory", "cc", "rax", "xmm6", "xmm7"
-);
-}
-
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa (%4),%%xmm4 \n"
- "movdqa (%5),%%xmm5 \n"
- "movdqa (%6),%%xmm6 \n"
- "movdqa (%7),%%xmm7 \n"
-"1:"
- "movdqa (%0),%%xmm2 \n"
- "pavgb (%0,%3,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%1) \n"
- "pextrw $0x2,%%xmm0,%%eax \n"
- "mov %%ax,0x4(%1) \n"
- "lea 0x6(%1),%1 \n"
- "sub $0x6,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"(_shufab0), // %4
- "r"(_shufab1), // %5
- "r"(_shufab2), // %6
- "r"(_scaleab2) // %7
- : "memory", "cc", "rax", "xmm6", "xmm7"
-);
-}
-
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
-"1:"
- "movdqa (%0),%%xmm2 \n"
- "lea (%0,%4,1),%%r10 \n"
- "movhlps %%xmm2,%%xmm3 \n"
- "lea -0x1(%3),%%r11 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
-
-"2:"
- "movdqa (%%r10),%%xmm0 \n"
- "lea (%%r10,%4,1),%%r10 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "paddusw %%xmm0,%%xmm2 \n"
- "paddusw %%xmm1,%%xmm3 \n"
- "sub $0x1,%%r11 \n"
- "ja 2b \n"
-
- "movdqa %%xmm2,(%1) \n"
- "movdqa %%xmm3,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width), // %2
- "+r"(src_height) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "r10", "r11"
-);
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-static void ScaleFilterRows_SSE2(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction) {
- if (source_y_fraction == 0) {
- asm volatile (
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "rax"
- );
- return;
- } else if (source_y_fraction == 128) {
- asm volatile (
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%3,1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "rax"
- );
- return;
- } else {
- asm volatile (
- "mov %3,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "punpcklwd %%xmm6,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "neg %%eax \n"
- "add $0x100,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm7,%%xmm0 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "punpckhbw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm5,%%xmm0 \n"
- "pmullw %%xmm5,%%xmm1 \n"
- "pmullw %%xmm6,%%xmm2 \n"
- "pmullw %%xmm6,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "rax", "xmm6", "xmm7"
- );
- }
- return;
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction) {
- source_y_fraction >>= 1;
- if (source_y_fraction == 0) {
- asm volatile (
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "rax"
- );
- return;
- } else if (source_y_fraction == 64) {
- asm volatile (
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%3,1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", "rax"
- );
- return;
- } else {
- asm volatile (
- "mov %3,%%eax \n"
- "mov %%al,%%ah \n"
- "neg %%al \n"
- "add $0x80,%%al \n"
- "movd %%eax,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "1:"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
- "lea 0x10(%1),%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
- "mov -0x1(%0),%%al \n"
- "mov %%al,(%0) \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "rax"
- );
- }
- return;
-}
-#endif
-#endif
-
-// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 2;
- }
-}
-
-static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] +
- src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
- src_ptr += 2;
- }
-}
-
-static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 4;
- }
-}
-
-static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
- src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
- src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
- src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
- 8) >> 4;
- src_ptr += 4;
- }
-}
-
-// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
-// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
-// The following 2 lines cause error on Windows.
-//static const int kMaxOutputWidth = 640;
-//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2;
-#define kMaxOutputWidth 640
-#define kMaxRow12 1280
-
-static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- for (x = 0; x < dst_width; ++x) {
- *dst++ = *src_ptr;
- src_ptr += 8;
- }
-}
-
-// Note calling code checks width is less than max and if not
-// uses ScaleRowDown8_C instead.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- ALIGN16(uint8 src_row[kMaxRow12 * 2]);
- assert(dst_width <= kMaxOutputWidth);
- ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
- ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
- src_row + kMaxOutputWidth,
- dst_width * 2);
- ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
-}
-
-static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- uint8* dend;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- dend = dst + dst_width;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[1];
- dst[2] = src_ptr[3];
- dst += 3;
- src_ptr += 4;
- } while (dst < dend);
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
- uint8* d, int dst_width) {
- uint8* dend;
- const uint8* s;
- const uint8* t;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- dend = d + dst_width;
- s = src_ptr;
- t = src_ptr + src_stride;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 * 3 + b0 + 2) >> 2;
- d[1] = (a1 * 3 + b1 + 2) >> 2;
- d[2] = (a2 * 3 + b2 + 2) >> 2;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
- uint8* d, int dst_width) {
- uint8* dend;
- const uint8* s;
- const uint8* t;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- dend = d + dst_width;
- s = src_ptr;
- t = src_ptr + src_stride;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 + b0 + 1) >> 1;
- d[1] = (a1 + b1 + 1) >> 1;
- d[2] = (a2 + b2 + 1) >> 1;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-// Filter row to 3/4
-static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width) {
- uint8* dend;
- const uint8* s;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- dend = dst_ptr + dst_width;
- s = src_ptr;
- do {
- dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- dst_ptr += 3;
- s += 4;
- } while (dst_ptr < dend);
-}
-#endif
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx) {
- int x = 0;
- int j;
- for (j = 0; j < dst_width; ++j) {
- int xi = x >> 16;
- int xf1 = x & 0xffff;
- int xf0 = 65536 - xf1;
-
- *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
- x += dx;
- }
-}
-
-//Not work on Windows
-//static const int kMaxInputWidth = 2560;
-#define kMaxInputWidth 2560
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-#define HAS_SCALEROWDOWN34_SSE2
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- ALIGN16(uint8 row[kMaxInputWidth]);
- assert((dst_width % 3 == 0) && (dst_width > 0));
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
- ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- ALIGN16(uint8 row[kMaxInputWidth]);
- assert((dst_width % 3 == 0) && (dst_width > 0));
- ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
- ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-#endif
-
-static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
- uint8* dst, int dst_width) {
- int x;
- assert(dst_width % 3 == 0);
- for (x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[3];
- dst[2] = src_ptr[6];
- dst += 3;
- src_ptr += 8;
- }
-}
-
-// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i+=3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
- src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
- src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
- src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
- (65536 / 6) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width) {
- int i;
- assert((dst_width % 3 == 0) && (dst_width > 0));
- for (i = 0; i < dst_width; i+=3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
- src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
- src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
- (65536 / 4) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// C version 8x2 -> 8x1
-static void ScaleFilterRows_C(uint8* dst_ptr,
- const uint8* src_ptr, int src_stride,
- int dst_width, int source_y_fraction) {
- int y1_fraction;
- int y0_fraction;
- const uint8* src_ptr1;
- uint8* end;
- assert(dst_width > 0);
- y1_fraction = source_y_fraction;
- y0_fraction = 256 - y1_fraction;
- src_ptr1 = src_ptr + src_stride;
- end = dst_ptr + dst_width;
- do {
- dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
- dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
- dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
- dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
- dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
- dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
- dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
- dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
- src_ptr += 8;
- src_ptr1 += 8;
- dst_ptr += 8;
- } while (dst_ptr < end);
- dst_ptr[0] = dst_ptr[-1];
-}
-
-void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- int x,y;
- assert(src_width > 0);
- assert(src_height > 0);
- for (x = 0; x < src_width; ++x) {
- const uint8* s = src_ptr + x;
- int sum = 0;
- for (y = 0; y < src_height; ++y) {
- sum += s[0];
- s += src_stride;
- }
- dst_ptr[x] = sum;
- }
-}
-
-/**
- * Scale plane, 1/2
- *
- * This is an optimized version for scaling down a plane to 1/2 of
- * its original size.
- *
- */
-static void ScalePlaneDown2(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- assert(IS_ALIGNED(src_width, 2));
- assert(IS_ALIGNED(src_height, 2));
-
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
- } else
-#endif
- {
- ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
- }
-
- {
- int y;
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 1);
- dst_ptr += dst_stride;
- }
- }
-}
-
-/**
- * Scale plane, 1/4
- *
- * This is an optimized version for scaling down a plane to 1/4 of
- * its original size.
- */
-static void ScalePlaneDown4(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- assert(IS_ALIGNED(src_width, 4));
- assert(IS_ALIGNED(src_height, 4));
-
-#if defined(HAS_SCALEROWDOWN4_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(dst_width, 4)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
- } else
-#endif
- {
- ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
- }
-
- {
- int y;
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 2);
- dst_ptr += dst_stride;
- }
- }
-}
-
-/**
- * Scale plane, 1/8
- *
- * This is an optimized version for scaling down a plane to 1/8
- * of its original size.
- *
- */
-static void ScalePlaneDown8(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- assert(IS_ALIGNED(src_width, 8));
- assert(IS_ALIGNED(src_height, 8));
-
-#if defined(HAS_SCALEROWDOWN8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
- } else
-#endif
- {
- ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
- ScaleRowDown8Int_C : ScaleRowDown8_C;
- }
-
- {
- int y;
- for (y = 0; y < dst_height; ++y) {
- ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += (src_stride << 3);
- dst_ptr += dst_stride;
- }
- }
-}
-
-/**
- * Scale plane down, 3/4
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-static void ScalePlaneDown34(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- assert(dst_width % 3 == 0);
-#if defined(HAS_SCALEROWDOWN34_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- (dst_width % 24 == 0)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_NEON;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
- }
- } else
-#endif
-
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
- }
- } else
-#endif
-#if defined(HAS_SCALEROWDOWN34_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_stride, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
- filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
- } else
-#endif
- {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_C;
- ScaleRowDown34_1 = ScaleRowDown34_C;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
- ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
- }
- }
- {
- int src_row = 0;
- int y;
- for (y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 1:
- ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
- break;
-
- case 2:
- ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
- dst_ptr, dst_width);
- break;
- }
- ++src_row;
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- if (src_row >= 3) {
- src_ptr += src_stride;
- src_row = 0;
- }
- }
-}
-}
-
-/**
- * Scale plane, 3/8
- *
- * This is an optimized version for scaling down a plane to 3/8
- * of its original size.
- *
- * Reduces 16x3 to 6x1
- */
-static void ScalePlaneDown38(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
- uint8* dst_ptr, int dst_width);
- assert(dst_width % 3 == 0);
-#if defined(HAS_SCALEROWDOWN38_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- (dst_width % 12 == 0)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_NEON;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
- }
- } else
-#endif
-
-#if defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_stride, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
- }
- } else
-#endif
- {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_C;
- ScaleRowDown38_2 = ScaleRowDown38_C;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
- ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
- }
- }
- {
- int src_row = 0;
- int y;
- for (y = 0; y < dst_height; ++y) {
- switch (src_row) {
- case 0:
- case 1:
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 3;
- ++src_row;
- break;
-
- case 2:
- ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
- src_ptr += src_stride * 2;
- src_row = 0;
- break;
- }
- dst_ptr += dst_stride;
- }
-}
-}
-
-__inline static uint32 SumBox(int iboxwidth, int iboxheight,
- int src_stride, const uint8* src_ptr) {
- int x, y;
- uint32 sum;
- assert(iboxwidth > 0);
- assert(iboxheight > 0);
- sum = 0u;
- for (y = 0; y < iboxheight; ++y) {
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- src_ptr += src_stride;
- }
- return sum;
-}
-
-static void ScalePlaneBoxRow(int dst_width, int boxheight,
- int dx, int src_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int x = 0;
- int i;
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- int boxwidth;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
- (boxwidth * boxheight);
- }
-}
-
-__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
- uint32 sum;
- int x;
- assert(iboxwidth > 0);
- sum = 0u;
- for (x = 0; x < iboxwidth; ++x) {
- sum += src_ptr[x];
- }
- return sum;
-}
-
-static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
- int scaletbl[2];
- int minboxwidth = (dx >> 16);
- scaletbl[0] = 65536 / (minboxwidth * boxheight);
- scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
- {
- int *scaleptr = scaletbl - minboxwidth;
- int x = 0;
- int i;
- for (i = 0; i < dst_width; ++i) {
- int ix = x >> 16;
- int boxwidth;
- x += dx;
- boxwidth = (x >> 16) - ix;
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
- }
- }
-}
-
-static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
- const uint16* src_ptr, uint8* dst_ptr) {
- int boxwidth = (dx >> 16);
- int scaleval = 65536 / (boxwidth * boxheight);
- int x = 0;
- int i;
- for (i = 0; i < dst_width; ++i) {
- *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
- x += boxwidth;
- }
-}
-
-/**
- * Scale plane down to any dimensions, with interpolation.
- * (boxfilter).
- *
- * Same method as SimpleScale, which is fixed point, outputting
- * one pixel of destination using fixed point (16.16) to step
- * through source, sampling a box of pixel with simple
- * averaging.
- */
-static void ScalePlaneBox(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int dx, dy;
- assert(dst_width > 0);
- assert(dst_height > 0);
- dy = (src_height << 16) / dst_height;
- dx = (src_width << 16) / dst_width;
- if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
- dst_height * 2 > src_height) {
- uint8* dst = dst_ptr;
- int dy = (src_height << 16) / dst_height;
- int dx = (src_width << 16) / dst_width;
- int y = 0;
- int j;
- for (j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
- int boxheight;
- y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
- }
- boxheight = (y >> 16) - iy;
- ScalePlaneBoxRow(dst_width, boxheight,
- dx, src_stride,
- src, dst);
-
- dst += dst_stride;
- }
- } else {
- ALIGN16(uint16 row[kMaxInputWidth]);
- void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
- uint16* dst_ptr, int src_width, int src_height);
- void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
- const uint16* src_ptr, uint8* dst_ptr);
-#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_width, 16)) {
- ScaleAddRows = ScaleAddRows_SSE2;
- } else
-#endif
- {
- ScaleAddRows = ScaleAddRows_C;
- }
- if (dx & 0xffff) {
- ScaleAddCols = ScaleAddCols2_C;
- } else {
- ScaleAddCols = ScaleAddCols1_C;
- }
-
- {
- int y = 0;
- int j;
- for (j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- const uint8* const src = src_ptr + iy * src_stride;
- int boxheight;
- y += dy;
- if (y > (src_height << 16)) {
- y = (src_height << 16);
- }
- boxheight = (y >> 16) - iy;
- ScaleAddRows(src, src_stride, row, src_width, boxheight);
- ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
- dst_ptr += dst_stride;
- }
- }
- }
-}
-
-/**
- * Scale plane to/from any dimensions, with interpolation.
- */
-static void ScalePlaneBilinearSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int i, j;
- uint8* dst = dst_ptr;
- int dx = (src_width << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int maxx = ((src_width - 1) << 16) - 1;
- int maxy = ((src_height - 1) << 16) - 1;
- int y = (dst_height < src_height) ? 32768 :
- (src_height << 16) / dst_height - 32768;
- for (i = 0; i < dst_height; ++i) {
- int cy = (y < 0) ? 0 : y;
- int yi = cy >> 16;
- int yf = cy & 0xffff;
- const uint8* const src = src_ptr + yi * src_stride;
- int x = (dst_width < src_width) ? 32768 :
- (src_width << 16) / dst_width - 32768;
- for (j = 0; j < dst_width; ++j) {
- int cx = (x < 0) ? 0 : x;
- int xi = cx >> 16;
- int xf = cx & 0xffff;
- int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
- int r1 = (src[xi + src_stride] * (65536 - xf) +
- src[xi + src_stride + 1] * xf) >> 16;
- *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
- x += dx;
- if (x > maxx)
- x = maxx;
- }
- dst += dst_stride - dst_width;
- y += dy;
- if (y > maxy)
- y = maxy;
- }
-}
-
-/**
- * Scale plane to/from any dimensions, with bilinear
- * interpolation.
- */
-static void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- int dy;
- int dx;
- assert(dst_width > 0);
- assert(dst_height > 0);
- dy = (src_height << 16) / dst_height;
- dx = (src_width << 16) / dst_width;
- if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
- ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
-
- } else {
- ALIGN16(uint8 row[kMaxInputWidth + 1]);
- void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
- int src_stride,
- int dst_width, int source_y_fraction);
- void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int dx);
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_width, 16)) {
- ScaleFilterRows = ScaleFilterRows_SSSE3;
- } else
-#endif
-#if defined(HAS_SCALEFILTERROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_width, 16)) {
- ScaleFilterRows = ScaleFilterRows_SSE2;
- } else
-#endif
- {
- ScaleFilterRows = ScaleFilterRows_C;
- }
- ScaleFilterCols = ScaleFilterCols_C;
-
- {
- int y = 0;
- int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
- int j;
- for (j = 0; j < dst_height; ++j) {
- int iy = y >> 16;
- int fy = (y >> 8) & 255;
- const uint8* const src = src_ptr + iy * src_stride;
- ScaleFilterRows(row, src, src_stride, src_width, fy);
- ScaleFilterCols(dst_ptr, row, dst_width, dx);
- dst_ptr += dst_stride;
- y += dy;
- if (y > maxy) {
- y = maxy;
- }
- }
- }
-}
-}
-
-/**
- * Scale plane to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
-static void ScalePlaneSimple(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- uint8* dst = dst_ptr;
- int dx = (src_width << 16) / dst_width;
- int y;
- for (y = 0; y < dst_height; ++y) {
- const uint8* const src = src_ptr + (y * src_height / dst_height) *
- src_stride;
- // TODO(fbarchard): Round X coordinate by setting x=0x8000.
- int x = 0;
- int i;
- for (i = 0; i < dst_width; ++i) {
- *dst++ = src[x >> 16];
- x += dx;
- }
- dst += dst_stride - dst_width;
- }
-}
-
-/**
- * Scale plane to/from any dimensions.
- */
-static void ScalePlaneAnySize(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- if (!filtering) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- // fall back to non-optimized version
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
-
-/**
- * Scale plane down, any size
- *
- * This is an optimized version for scaling down a plane to any size.
- * The current implementation is ~10 times faster compared to the
- * reference implementation for e.g. XGA->LowResPAL
- *
- */
-static void ScalePlaneDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterModeEnum filtering) {
- if (!filtering) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
- // between 1/2x and 1x use bilinear
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
-
-/**
- * Copy plane, no scaling
- *
- * This simply copies the given plane without scaling.
- * The current implementation is ~115 times faster
- * compared to the reference implementation.
- *
- */
-static void CopyPlane(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
- if (src_stride == src_width && dst_stride == dst_width) {
- // All contiguous, so can use REALLY fast path.
- memcpy(dst_ptr, src_ptr, src_width * src_height);
- } else {
- // Not all contiguous; must copy scanlines individually
- const uint8* src = src_ptr;
- uint8* dst = dst_ptr;
- int i;
- for (i = 0; i < src_height; ++i) {
- memcpy(dst, src, src_width);
- dst += dst_stride;
- src += src_stride;
- }
- }
-}
-
-static void ScalePlane(const uint8* src, int src_stride,
- int src_width, int src_height,
- uint8* dst, int dst_stride,
- int dst_width, int dst_height,
- FilterModeEnum filtering, int use_ref) {
- // Use specialized scales to improve performance for common resolutions.
- // For example, all the 1/2 scalings will use ScalePlaneDown2()
- if (dst_width == src_width && dst_height == src_height) {
- // Straight copy.
- CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
- dst_stride, src, dst);
- } else if (dst_width <= src_width && dst_height <= src_height) {
- // Scale down.
- if (use_ref) {
- // For testing, allow the optimized versions to be disabled.
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
- // optimized, 3/4
- ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
- // optimized, 1/2
- ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- // 3/8 rounded up for odd sized chroma height.
- } else if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
- // optimized, 3/8
- ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
- // optimized, 1/4
- ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
- // optimized, 1/8
- ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else {
- // Arbitrary downsample
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- }
- } else {
- // Arbitrary scale up and/or down.
- ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- }
-}
-
-/**
- * Scale a plane.
- *
- * This function in turn calls a scaling function
- * suitable for handling the desired resolutions.
- *
- */
-
-int I420Scale(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int dst_width, int dst_height,
- FilterModeEnum filtering) {
- if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (src_height < 0) {
- int halfheight;
- src_height = -src_height;
- halfheight = (src_height + 1) >> 1;
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- {
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
-
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering, use_reference_impl_);
- }
- return 0;
-}
-
-// Deprecated api
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
- int src_stride_y, int src_stride_u, int src_stride_v,
- int src_width, int src_height,
- uint8* dst_y, uint8* dst_u, uint8* dst_v,
- int dst_stride_y, int dst_stride_u, int dst_stride_v,
- int dst_width, int dst_height,
- int interpolate) {
- if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (src_height < 0) {
- int halfheight;
- src_height = -src_height;
- halfheight = (src_height + 1) >> 1;
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- {
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
- FilterModeEnum filtering = interpolate ? kFilterBox : kFilterNone;
-
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering, use_reference_impl_);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering, use_reference_impl_);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering, use_reference_impl_);
- }
- return 0;
-}
-
-// Deprecated api
-int ScaleOffset(const uint8* src, int src_width, int src_height,
- uint8* dst, int dst_width, int dst_height, int dst_yoffset,
- int interpolate) {
- if (!src || src_width <= 0 || src_height <= 0 ||
- !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
- dst_yoffset >= dst_height) {
- return -1;
- }
- dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
- {
- int src_halfwidth = (src_width + 1) >> 1;
- int src_halfheight = (src_height + 1) >> 1;
- int dst_halfwidth = (dst_width + 1) >> 1;
- int dst_halfheight = (dst_height + 1) >> 1;
- int aheight = dst_height - dst_yoffset * 2; // actual output height
- const uint8* const src_y = src;
- const uint8* const src_u = src + src_width * src_height;
- const uint8* const src_v = src + src_width * src_height +
- src_halfwidth * src_halfheight;
- uint8* dst_y = dst + dst_yoffset * dst_width;
- uint8* dst_u = dst + dst_width * dst_height +
- (dst_yoffset >> 1) * dst_halfwidth;
- uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
- (dst_yoffset >> 1) * dst_halfwidth;
- return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
- src_width, src_height, dst_y, dst_u, dst_v, dst_width,
- dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
- }
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc
new file mode 100644
index 0000000..31cedf1
--- /dev/null
+++ b/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1716 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyPlane
+#include "third_party/libyuv/include/libyuv/row.h"
+#include "third_party/libyuv/include/libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Remove this macro if OVERREAD is safe.
+#define AVOID_OVERREAD 1
+
+static __inline int Abs(int v) { // Returns v when v >= 0, otherwise -v.
+ return v >= 0 ? v : -v;
+}
+// SUBSAMPLE(v, a, s): (|v| + a) >> s with the sign of v restored. Fully parenthesized.
+#define SUBSAMPLE(v, a, s) (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size. A row function is picked from the filter mode and CPU
+// flags, then called once per destination row. src_width/src_height unused.
+static void ScalePlaneDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering == kFilterNone ? ScaleRowDown2_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
+ ScaleRowDown2Box_C);
+ int row_stride = src_stride << 1; // Source advance per output row.
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0; // Row function is handed a zero stride.
+ }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
+ } // Any nonzero filtering selects the Box NEON row function.
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
+ ScaleRowDown2Box_Unaligned_SSE2); // Default: unaligned variants.
+ if (IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
+ } // Upgraded only when every IS_ALIGNED(.., 16) check passes.
+ }
+#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+ }
+#endif // Only the first defined HAS_* branch above is compiled.
+
+ if (filtering == kFilterLinear) { // Linear: call rows with stride 0.
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+// Scale plane, 1/2, for planes of uint16 samples. src_width/src_height unused.
+static void ScalePlaneDown2_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering == kFilterNone ? ScaleRowDown2_16_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+ ScaleRowDown2Box_16_C);
+ int row_stride = src_stride << 1; // Source advance per output row.
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0; // Row function is handed a zero stride.
+ }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+ ScaleRowDown2_16_NEON;
+ } // Any nonzero filtering selects the Box NEON row function.
+#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ?
+ ScaleRowDown2_Unaligned_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
+ ScaleRowDown2Box_Unaligned_16_SSE2); // Default: unaligned variants.
+ if (IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
+ } // Upgraded only when every IS_ALIGNED(.., 16) check passes.
+ }
+#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
+ }
+#endif // Only the first defined HAS_* branch above is compiled.
+
+ if (filtering == kFilterLinear) { // Linear: call rows with stride 0.
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size. A row function is picked from the filter mode and CPU
+// flags, then called once per destination row. src_width/src_height unused.
+static void ScalePlaneDown4(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+ int row_stride = src_stride << 2; // Source advance per output row.
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0; // Row function is handed a zero stride.
+ }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+ }
+#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+ }
+#endif // Only the first defined HAS_* branch above is compiled.
+
+ if (filtering == kFilterLinear) { // Linear: call rows with stride 0.
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+// Scale plane, 1/4, for planes of uint16 samples. src_width/src_height unused.
+static void ScalePlaneDown4_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+ int row_stride = src_stride << 2; // Source advance per output row.
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0; // Row function is handed a zero stride.
+ }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+ ScaleRowDown4_16_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+ ScaleRowDown4_16_SSE2;
+ }
+#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
+ }
+#endif // Only the first defined HAS_* branch above is compiled.
+
+ if (filtering == kFilterLinear) { // Linear: call rows with stride 0.
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane down, 3/4
+// Each main-loop pass writes 3 destination rows and advances 4 source rows.
+static void ScalePlaneDown34(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+ } // C defaults; each SIMD block below may replace them.
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+ // filter_stride (0 for kFilterLinear, else src_stride) is what rows receive.
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+ dst_ptr, dst_width); // Starts one row down with a negated stride.
+ src_ptr += src_stride * 2; // Total source advance this pass: 4 rows.
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+// 16-bit-sample variant of the 3/4 down-scaler: each pass of the main loop
+// consumes 4 source rows and emits 3 destination rows.
+// src_width and src_height are not read by this function.
+// dst_width must be a multiple of 3 (asserted). Strides are added to
+// uint16 pointers, so they are counted in samples, not bytes.
+static void ScalePlaneDown34_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ // Two row kernels, one per vertical phase of the 4 -> 3 pattern.
+ void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ // Stride handed to the row kernels; 0 for kFilterLinear (presumably so the
+ // kernel pairs a row with itself, i.e. no vertical blending - confirm
+ // against the row kernel implementations).
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ // Portable C defaults: point-sampling kernel when filtering is off,
+ // box kernels otherwise.
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_16_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
+ }
+ // Each block below may override the choice above; when several are compiled
+ // in, the last one whose runtime conditions hold wins.
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Main loop: full groups of 3 output rows. The source pointer advances by
+ // 1 + 1 + 2 = 4 rows per group.
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ // Third output row: start one source row further down and use a negative
+ // stride, so the kernel's second row is the row above it.
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ // (the final call passes a stride of 0).
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses box filters arranged like this
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+// Scales an 8-bit plane to 3/8 size: each pass of the main loop consumes
+// 8 source rows (3 + 3 + 2) and emits 3 destination rows.
+// src_width and src_height are not read by this function.
+// dst_width must be a multiple of 3 (asserted).
+static void ScalePlaneDown38(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ // Row kernels for the 3-row-tall and 2-row-tall bands of the pattern.
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ // Stride handed to the row kernels; 0 for kFilterLinear (presumably this
+ // disables vertical blending inside the kernel - confirm against the row
+ // kernel implementations).
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ // Portable C defaults: point-sampling kernel when filtering is off,
+ // box kernels otherwise.
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+ }
+ // The #if/#elif chain compiles in at most one of the optimized selections
+ // below. Note the SSSE3 path requires dst_width % 24 == 0 while the NEON
+ // and MIPS paths require dst_width % 12 == 0.
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Main loop: full groups of 3 output rows built from source bands of
+ // 3, 3 and 2 rows.
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ // (the final call passes a stride of 0).
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+// 16-bit-sample variant of the 3/8 down-scaler: each pass of the main loop
+// consumes 8 source rows (3 + 3 + 2) and emits 3 destination rows.
+// src_width and src_height are not read by this function.
+// dst_width must be a multiple of 3 (asserted). Strides are added to
+// uint16 pointers, so they are counted in samples, not bytes.
+static void ScalePlaneDown38_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ // Row kernels for the 3-row-tall and 2-row-tall bands of the pattern.
+ void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ // Stride handed to the row kernels; 0 for kFilterLinear (presumably this
+ // disables vertical blending inside the kernel - confirm against the row
+ // kernel implementations).
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ // Portable C defaults: point-sampling kernel when filtering is off,
+ // box kernels otherwise.
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_16_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+ }
+ // The #if/#elif chain compiles in at most one of the optimized selections
+ // below. Note the SSSE3 path requires dst_width % 24 == 0 while the NEON
+ // and MIPS paths require dst_width % 12 == 0.
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Main loop: full groups of 3 output rows built from source bands of
+ // 3, 3 and 2 rows.
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ // (the final call passes a stride of 0).
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+// Returns the sum of all 8-bit samples in a box of iboxwidth x iboxheight
+// whose top-left sample is src_ptr[0]; consecutive box rows are src_stride
+// samples apart. Both box dimensions must be positive (asserted).
+static __inline uint32 SumBox(int iboxwidth, int iboxheight,
+                              ptrdiff_t src_stride, const uint8* src_ptr) {
+  uint32 total = 0u;
+  int row;
+  int col;
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  for (row = 0; row < iboxheight; ++row, src_ptr += src_stride) {
+    for (col = 0; col < iboxwidth; ++col) {
+      total += src_ptr[col];
+    }
+  }
+  return total;
+}
+
+// Returns the sum of all 16-bit samples in a box of iboxwidth x iboxheight
+// whose top-left sample is src_ptr[0]; consecutive box rows are src_stride
+// samples apart. Both box dimensions must be positive (asserted).
+static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
+                                 ptrdiff_t src_stride, const uint16* src_ptr) {
+  uint32 total = 0u;
+  int row;
+  int col;
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  for (row = 0; row < iboxheight; ++row, src_ptr += src_stride) {
+    for (col = 0; col < iboxwidth; ++col) {
+      total += src_ptr[col];
+    }
+  }
+  return total;
+}
+
+// Produces one destination row of box-filtered 8-bit samples.
+//   dst_width  - number of output samples to write to dst_ptr.
+//   boxheight  - number of source rows averaged per output sample; must be
+//                positive (SumBox asserts this and it is used as a divisor).
+//   x, dx      - 16.16 fixed point start position and step in the source row.
+//   src_stride - distance between source rows, in samples.
+// Each output is the integer average of the source box that starts at
+// column (x >> 16) and spans up to the next sample's start column.
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+                               int x, int dx, ptrdiff_t src_stride,
+                               const uint8* src_ptr, uint8* dst_ptr) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    const int ix = x >> 16;
+    int boxwidth;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    // When the horizontal step is below 1.0 two consecutive outputs can map
+    // to the same source column, giving a zero-width box. Clamp to a single
+    // column so the divisor below is never zero.
+    if (boxwidth < 1) {
+      boxwidth = 1;
+    }
+    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
+                 (boxwidth * boxheight);
+  }
+}
+
+// Produces one destination row of box-filtered 16-bit samples.
+//   dst_width  - number of output samples to write to dst_ptr.
+//   boxheight  - number of source rows averaged per output sample; must be
+//                positive (SumBox_16 asserts this and it is used as a
+//                divisor).
+//   x, dx      - 16.16 fixed point start position and step in the source row.
+//   src_stride - distance between source rows, in samples.
+// Each output is the integer average of the source box that starts at
+// column (x >> 16) and spans up to the next sample's start column.
+static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
+                                  int x, int dx, ptrdiff_t src_stride,
+                                  const uint16* src_ptr, uint16* dst_ptr) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    const int ix = x >> 16;
+    int boxwidth;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    // When the horizontal step is below 1.0 two consecutive outputs can map
+    // to the same source column, giving a zero-width box. Clamp to a single
+    // column so the divisor below is never zero.
+    if (boxwidth < 1) {
+      boxwidth = 1;
+    }
+    *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
+                 (boxwidth * boxheight);
+  }
+}
+
+// Returns the sum of the first iboxwidth entries of src_ptr.
+// iboxwidth must be positive (asserted).
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  uint32 total = 0u;
+  int i = 0;
+  assert(iboxwidth > 0);
+  while (i < iboxwidth) {
+    total += src_ptr[i];
+    ++i;
+  }
+  return total;
+}
+
+// Returns the sum of the first iboxwidth entries of src_ptr.
+// iboxwidth must be positive (asserted).
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+  uint32 total = 0u;
+  int i = 0;
+  assert(iboxwidth > 0);
+  while (i < iboxwidth) {
+    total += src_ptr[i];
+    ++i;
+  }
+  return total;
+}
+
+// Horizontal pass of the box filter for a fractional step (dx has a non-zero
+// low 16 bits), so box widths alternate between two adjacent values.
+//   dst_width - number of output samples written to dst_ptr.
+//   boxheight - number of rows already summed into each src_ptr entry; used
+//               as part of the divisor and must be positive.
+//   x, dx     - 16.16 fixed point start position and step.
+//   src_ptr   - row of vertical column sums.
+// Each output is (sum of a box of column sums) * (65536 / box area) >> 16.
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int i;
+  // Reciprocals (in 16.16) of the two possible box areas; entry 0 is for the
+  // narrower box and entry 1 for the box that is one column wider.
+  int scaletbl[2];
+  const int minboxwidth = (dx >> 16);
+  // Widths are clamped to 1 so that a step below 1.0 (minboxwidth == 0)
+  // cannot cause a division by zero.
+  const int width0 = (minboxwidth < 1) ? 1 : minboxwidth;
+  const int width1 = (minboxwidth + 1 < 1) ? 1 : (minboxwidth + 1);
+  scaletbl[0] = 65536 / (width0 * boxheight);
+  scaletbl[1] = 65536 / (width1 * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    const int ix = x >> 16;
+    int boxwidth;
+    int tbl_index;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    // Index the table directly instead of through a pointer biased by
+    // -minboxwidth, which would point outside the array.
+    tbl_index = boxwidth - minboxwidth;
+    assert(tbl_index == 0 || tbl_index == 1);
+    if (boxwidth < 1) {
+      boxwidth = 1;
+    }
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaletbl[tbl_index] >> 16;
+  }
+}
+
+// 16-bit-output variant of the fractional-step horizontal box pass: box
+// widths alternate between two adjacent values.
+//   dst_width - number of output samples written to dst_ptr.
+//   boxheight - number of rows already summed into each src_ptr entry; used
+//               as part of the divisor and must be positive.
+//   x, dx     - 16.16 fixed point start position and step.
+//   src_ptr   - row of vertical column sums.
+// Each output is (sum of a box of column sums) * (65536 / box area) >> 16.
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int i;
+  // Reciprocals (in 16.16) of the two possible box areas; entry 0 is for the
+  // narrower box and entry 1 for the box that is one column wider.
+  int scaletbl[2];
+  const int minboxwidth = (dx >> 16);
+  // Widths are clamped to 1 so that a step below 1.0 (minboxwidth == 0)
+  // cannot cause a division by zero.
+  const int width0 = (minboxwidth < 1) ? 1 : minboxwidth;
+  const int width1 = (minboxwidth + 1 < 1) ? 1 : (minboxwidth + 1);
+  scaletbl[0] = 65536 / (width0 * boxheight);
+  scaletbl[1] = 65536 / (width1 * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    const int ix = x >> 16;
+    int boxwidth;
+    int tbl_index;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    // Index the table directly instead of through a pointer biased by
+    // -minboxwidth, which would point outside the array.
+    tbl_index = boxwidth - minboxwidth;
+    assert(tbl_index == 0 || tbl_index == 1);
+    if (boxwidth < 1) {
+      boxwidth = 1;
+    }
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+        scaletbl[tbl_index] >> 16;
+  }
+}
+
+// Horizontal pass of the box filter for a whole-number step: every box has
+// the same width (dx >> 16). Here x is used directly as an index into
+// src_ptr and advances by one box width per output sample.
+// Each output is (sum of a box of column sums) * (65536 / box area) >> 16.
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  const int boxwidth = (dx >> 16);
+  const int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  for (i = 0; i < dst_width; ++i, x += boxwidth) {
+    dst_ptr[i] = (SumPixels(boxwidth, src_ptr + x) * scaleval) >> 16;
+  }
+}
+
+// 16-bit-output variant of the whole-number-step horizontal box pass: every
+// box has the same width (dx >> 16). Here x is used directly as an index
+// into src_ptr and advances by one box width per output sample.
+// Each output is (sum of a box of column sums) * (65536 / box area) >> 16.
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  const int boxwidth = (dx >> 16);
+  const int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  for (i = 0; i < dst_width; ++i, x += boxwidth) {
+    dst_ptr[i] = (SumPixels_16(boxwidth, src_ptr + x) * scaleval) >> 16;
+  }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixel with simple
+// averaging.
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int row;
+  // Source position (x, y) and per-output step (dx, dy), 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // Largest permitted y: one past the last source row, in 16.16.
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
+  // Generic path: used when the width is not a multiple of 16 or the vertical
+  // reduction is less than 2x. Each output row is built box by box.
+  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
+    uint8* out = dst_ptr;
+    for (row = 0; row < dst_height; ++row) {
+      const int top = y >> 16;
+      const uint8* box_src = src_ptr + top * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - top;
+      ScalePlaneBoxRow_C(dst_width, boxheight, x, dx, src_stride,
+                         box_src, out);
+      out += dst_stride;
+    }
+    return;
+  }
+  {
+    // Two-pass path: sum each band of rows into a uint16 row buffer, then
+    // reduce that buffer horizontally.
+    align_buffer_64(row16, src_width * 2);
+    // Fractional horizontal step needs the two-width column pass.
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C : ScaleAddCols1_C;
+    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
+        uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
+
+#if defined(HAS_SCALEADDROWS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+#ifdef AVOID_OVERREAD
+        IS_ALIGNED(src_width, 16) &&
+#endif
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+      ScaleAddRows = ScaleAddRows_SSE2;
+    }
+#endif
+
+    for (row = 0; row < dst_height; ++row) {
+      const int top = y >> 16;
+      const uint8* band_src = src_ptr + top * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - top;
+      ScaleAddRows(band_src, src_stride, (uint16*)(row16),
+                   src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
+                   dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row16);
+  }
+}
+
+// 16-bit-sample variant of the box-filter plane down-scaler. Uses 16.16
+// fixed point stepping through the source and averages a box of samples
+// for every destination sample.
+static void ScalePlaneBox_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr) {
+  int row;
+  // Source position (x, y) and per-output step (dx, dy), 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // Largest permitted y: one past the last source row, in 16.16.
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
+  // Generic path: used when the width is not a multiple of 16 or the vertical
+  // reduction is less than 2x. Each output row is built box by box.
+  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
+    uint16* out = dst_ptr;
+    for (row = 0; row < dst_height; ++row) {
+      const int top = y >> 16;
+      const uint16* box_src = src_ptr + top * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - top;
+      ScalePlaneBoxRow_16_C(dst_width, boxheight, x, dx, src_stride,
+                            box_src, out);
+      out += dst_stride;
+    }
+    return;
+  }
+  {
+    // Two-pass path: sum each band of rows into a uint32 row buffer, then
+    // reduce that buffer horizontally.
+    align_buffer_64(row32, src_width * 4);
+    // Fractional horizontal step needs the two-width column pass.
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+    void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
+        uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
+
+#if defined(HAS_SCALEADDROWS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+#ifdef AVOID_OVERREAD
+        IS_ALIGNED(src_width, 16) &&
+#endif
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+      ScaleAddRows = ScaleAddRows_16_SSE2;
+    }
+#endif
+
+    for (row = 0; row < dst_height; ++row) {
+      const int top = y >> 16;
+      const uint16* band_src = src_ptr + top * src_stride;
+      int boxheight;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - top;
+      ScaleAddRows(band_src, src_stride, (uint32*)(row32),
+                   src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
+                   dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row32);
+  }
+}
+
+// Scale plane down with bilinear interpolation.
+// For every destination row the two neighbouring source rows are first
+// blended vertically into a temporary row (skipped for kFilterLinear), then
+// that row is resampled horizontally into dst_ptr.
+// src_width may be negative; its magnitude is used for sizing and kernel
+// selection after ScaleSlope has seen the signed value.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer. Size it from the magnitude of src_width: the
+  // signed value is only normalized further down, and a negative size would
+  // request a bogus allocation.
+  align_buffer_64(row, Abs(src_width));
+
+  // y is clamped to the last source row (16.16).
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Vertical blend kernel selection. Later blocks override earlier ones.
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  // Horizontal kernel selection, done on the normalized width. Rows of
+  // 32768 samples or more use the 64-bit stepping variant.
+  if (src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      // Horizontal-only filtering: resample the source row directly.
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y select the vertical blend.
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale a 16-bit-sample plane down with bilinear interpolation.
+// For every destination row the two neighbouring source rows are first
+// blended vertically into a temporary row (skipped for kFilterLinear), then
+// that row is resampled horizontally into dst_ptr.
+// src_width may be negative; its magnitude is used for sizing and kernel
+// selection after ScaleSlope has seen the signed value.
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer (2 bytes per sample). Size it from the magnitude
+  // of src_width: the signed value is only normalized further down, and a
+  // negative size would request a bogus allocation.
+  align_buffer_64(row, Abs(src_width) * 2);
+
+  // y is clamped to the last source row (16.16).
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) = ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  // Vertical blend kernel selection. Later blocks override earlier ones.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  // Horizontal kernel selection, done on the normalized width. Rows of
+  // 32768 samples or more use the 64-bit stepping variant.
+  if (src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_16_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      // Horizontal-only filtering: resample the source row directly.
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      // Top 8 bits of the fractional part of y select the vertical blend.
+      int yf = (y >> 8) & 255;
+      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+// Works in the opposite order to the down-scaler: each needed source row is
+// first resampled horizontally to dst_width into one of two cached rows,
+// and InterpolateRow then combines the two cached rows into each
+// destination row. A source row is only resampled when the integer part of
+// y changes.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // y is clamped to the last source row (16.16).
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ // Horizontal kernel: filtered column scaler when any filtering is
+ // requested, otherwise the unfiltered column scaler.
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+ // Vertical blend kernel selection. The conditions test the destination,
+ // because InterpolateRow writes dst_width samples straight into dst_ptr.
+ // Later blocks override earlier ones.
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Rows of 32768 samples or more use the 64-bit stepping column filter.
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+ // Unfiltered exact 2x horizontal upscale has a dedicated column kernel.
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ // kRowSize is dst_width rounded up to a multiple of 16.
+ const int kRowSize = (dst_width + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ // rowptr is the cached row for source row lasty; rowptr + rowstride is
+ // the other cached row. rowstride flips sign each time a new source row
+ // is loaded, so the two buffers swap roles.
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // Prime both cached rows. With a single source row the same row is
+ // resampled twice instead of stepping past it.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ // Moved to a new source row; clamp to the last row first.
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ // Re-check: clamping may have brought yi back to the cached row.
+ if (yi != lasty) {
+ // Overwrite the older cached row with the new source row, then swap.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ // Horizontal-only filtering: stride 0 and fraction 0 are passed, so
+ // only the current cached row is referenced.
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ // Top 8 bits of the fractional part of y select the vertical blend.
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// 16-bit-sample variant of the bilinear up-scaler. Each needed source row
+// is first resampled horizontally to dst_width into one of two cached rows,
+// and InterpolateRow then combines the two cached rows into each
+// destination row. A source row is only resampled when the integer part of
+// y changes.
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // y is clamped to the last source row (16.16).
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ // Horizontal kernel: filtered column scaler when any filtering is
+ // requested, otherwise the unfiltered column scaler.
+ void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+ // Vertical blend kernel selection. The conditions test the destination,
+ // because InterpolateRow writes dst_width samples straight into dst_ptr.
+ // Later blocks override earlier ones.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Rows of 32768 samples or more use the 64-bit stepping column filter.
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_16_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ // Unfiltered exact 2x horizontal upscale has a dedicated column kernel.
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleFilterCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ // kRowSize is dst_width rounded up to a multiple of 16, in samples; the
+ // allocation is kRowSize * 4 bytes, i.e. two rows of 2-byte samples.
+ const int kRowSize = (dst_width + 15) & ~15;
+ align_buffer_64(row, kRowSize * 4);
+
+ // rowptr is the cached row for source row lasty; rowptr + rowstride is
+ // the other cached row. rowstride flips sign each time a new source row
+ // is loaded, so the two buffers swap roles.
+ uint16* rowptr = (uint16*)row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // Prime both cached rows. With a single source row the same row is
+ // resampled twice instead of stepping past it.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ // Moved to a new source row; clamp to the last row first.
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ // Re-check: clamping may have brought yi back to the cached row.
+ if (yi != lasty) {
+ // Overwrite the older cached row with the new source row, then swap.
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ // Horizontal-only filtering: stride 0 and fraction 0 are passed, so
+ // only the current cached row is referenced.
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ // Top 8 bits of the fractional part of y select the vertical blend.
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScalePlaneSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {  // Point-sampling scaler for an 8-bit plane.
+ int i;
+ void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_C;  // Generic column sampler by default.
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);  // ScaleSlope (defined elsewhere) receives the addresses of x, y, dx, dy.
+ src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+ if (src_width * 2 == dst_width && x < 0x8000) {  // Exact 2x horizontal upscale.
+ ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleCols = ScaleColsUp2_SSE2;  // Chosen only when width, pointers and strides are aligned.
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {  // One destination row per iteration.
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+ dst_width, x, dx);  // y >> 16 is the integer source row.
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+static void ScalePlaneSimple_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr) {  // Point-sampling scaler for a 16-bit plane.
+ int i;
+ void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_16_C;  // Generic column sampler by default.
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);  // ScaleSlope (defined elsewhere) receives the addresses of x, y, dx, dy.
+ src_width = Abs(src_width);  // Only the magnitude is used from here on.
+
+ if (src_width * 2 == dst_width && x < 0x8000) {  // Exact 2x horizontal upscale.
+ ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleCols = ScaleColsUp2_16_SSE2;  // Chosen only when width, pointers and strides are aligned.
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {  // One destination row per iteration.
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+ dst_width, x, dx);  // y >> 16 is the integer source row.
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale a plane.
+// This function dispatches to a specialized scaler based on scale factor.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {  // Scales one 8-bit plane; picks the first matching scaler below.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;  // Start at the last source row...
+ src_stride = -src_stride;  // ...and walk upwards.
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {  // Box filter only below 1/2 height.
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {  // Filtered vertical upscale.
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {  // Any other filtered case.
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);  // No filtering: point sample.
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {  // Scales one 16-bit plane; picks the first matching scaler below.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;  // Start at the last source row...
+ src_stride = -src_stride;  // ...and walk upwards.
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical_16(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {  // Box filter only below 1/2 height.
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {  // Filtered vertical upscale.
+ ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {  // Any other filtered case.
+ ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);  // No filtering: point sample.
+}
+
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {  // Returns 0 on success, -1 on invalid arguments.
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);  // Chroma plane sizes via SUBSAMPLE (defined elsewhere).
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;  // Source dims are rejected only when zero; negative values pass through.
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);  // Y plane at full size.
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);  // U plane at subsampled size.
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);  // V plane at subsampled size.
+ return 0;
+}
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+ const uint16* src_u, int src_stride_u,
+ const uint16* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint16* dst_y, int dst_stride_y,
+ uint16* dst_u, int dst_stride_u,
+ uint16* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {  // 16-bit variant; returns 0 on success, -1 on invalid arguments.
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);  // Chroma plane sizes via SUBSAMPLE (defined elsewhere).
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;  // Source dims are rejected only when zero; negative values pass through.
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);  // Y plane at full size.
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);  // U plane at subsampled size.
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);  // V plane at subsampled size.
+ return 0;
+}
+
+// Deprecated api
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ LIBYUV_BOOL interpolate) {  // Wrapper: reorders the arguments for I420Scale.
+ return I420Scale(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ dst_width, dst_height,
+ interpolate ? kFilterBox : kFilterNone);  // Boolean maps to box filter or no filter.
+}
+
+// Deprecated api
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ LIBYUV_BOOL interpolate) {  // Scales a contiguous Y/U/V buffer into dst at a row offset.
+ // Chroma requires offset to multiple of 2.
+ int dst_yoffset_even = dst_yoffset & ~1;  // Clears the low bit.
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int aheight = dst_height - dst_yoffset_even * 2; // actual output height
+ const uint8* src_y = src;  // Planes are addressed back to back: Y, then U, then V.
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset_even * dst_width;  // Skip dst_yoffset_even luma rows.
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset_even >> 1) * dst_halfwidth;  // Half as many chroma rows are skipped.
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset_even >> 1) * dst_halfwidth;
+ if (!src || src_width <= 0 || src_height <= 0 ||
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+ dst_yoffset_even >= dst_height) {
+ return -1;  // NOTE(review): the pointers above are computed before this validation.
+ }
+ return I420Scale(src_y, src_width,
+ src_u, src_halfwidth,
+ src_v, src_halfwidth,
+ src_width, src_height,
+ dst_y, dst_width,
+ dst_u, dst_halfwidth,
+ dst_v, dst_halfwidth,
+ dst_width, aheight,
+ interpolate ? kFilterBox : kFilterNone);  // Strides equal the plane widths.
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 0000000..595ad66
--- /dev/null
+++ b/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1165 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyARGB
+#include "third_party/libyuv/include/libyuv/row.h"
+#include "third_party/libyuv/include/libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {  // Absolute value of v.
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 1/2 width by point sampling; src_stride is unused.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = src_ptr[1];  // Keeps the second pixel of each source pair.
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 1/2 width by point sampling; src_stride is unused.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = src_ptr[1];  // Keeps the second pixel of each source pair.
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 1/2 width, averaging horizontal pairs; src_stride is unused.
+ const uint8* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (s[0] + s[1] + 1) >> 1;  // Rounded average of two neighbours.
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 1/2 width, averaging horizontal pairs; src_stride is unused.
+ const uint16* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (s[0] + s[1] + 1) >> 1;  // Rounded average of two neighbours.
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 1/2 width using a 2x2 box average over two rows.
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // Rounded average of four samples.
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 1/2 width using a 2x2 box average over two rows.
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // Rounded average of four samples.
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 1/4 width by point sampling; src_stride is unused.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = src_ptr[2];  // Keeps the third pixel of each group of four.
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 1/4 width by point sampling; src_stride is unused.
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = src_ptr[2];  // Keeps the third pixel of each group of four.
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 1/4 width using a 4x4 box average over four rows.
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;  // +8 rounds; >> 4 divides by the 16 samples.
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 1/4 width using a 4x4 box average over four rows.
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;  // +8 rounds; >> 4 divides by the 16 samples.
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 3/4 width by point sampling; src_stride is unused.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];  // Source pixel 2 is dropped.
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 3/4 width by point sampling; src_stride is unused.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];  // Source pixel 2 is dropped.
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {  // 3/4 width; rows s and t are blended 3:1.
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;  // Horizontal weights 3:1.
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;  // Horizontal weights 1:1.
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;  // Horizontal weights 1:3.
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;  // Vertical weights 3:1, rounded.
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {  // 16-bit 3/4 width; rows s and t are blended 3:1.
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;  // Horizontal weights 3:1.
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;  // Horizontal weights 1:1.
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;  // Horizontal weights 1:3.
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;  // Vertical weights 3:1, rounded.
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {  // 3/4 width; rows s and t are blended 1:1.
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;  // Horizontal weights 3:1.
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;  // Horizontal weights 1:1.
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;  // Horizontal weights 1:3.
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;  // Vertical weights 1:1, rounded.
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {  // 16-bit 3/4 width; rows s and t are blended 1:1.
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;  // Second source row.
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {  // Every 4 source pixels yield 3 outputs.
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;  // Horizontal weights 3:1.
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;  // Horizontal weights 1:1.
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;  // Horizontal weights 1:3.
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;  // Vertical weights 1:1, rounded.
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {  // x is the start position and dx the step; both are shifted by 16 to index.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ dst_ptr[0] = src_ptr[x >> 16];  // x >> 16 is the source column index.
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {  // 16-bit point sampling; x and dx are shifted by 16 to index.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ dst_ptr[0] = src_ptr[x >> 16];  // x >> 16 is the source column index.
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {  // x and dx are unused in this body.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];  // Each source pixel is written twice.
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {  // 16-bit 2x upsample; x and dx are unused in this body.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];  // Each source pixel is written twice.
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+// Blend a towards b by f / 65536: (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+ ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {  // Horizontal scale with linear blending of neighbouring pixels.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ int xi = x >> 16;  // Integer source column.
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];  // Right-hand neighbour; reads one past xi.
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);  // Low 16 bits of x are the blend fraction.
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x32, int dx) {  // As ScaleFilterCols_C, with the position kept in 64 bits.
+ int64 x = (int64)(x32);  // Widened so repeated x += dx is done in 64-bit arithmetic.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ int64 xi = x >> 16;  // Integer source column.
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];  // Right-hand neighbour; reads one past xi.
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);  // Low 16 bits of x are the blend fraction.
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+ (int)(((int64)(f) * ((int64)(b) - (int64)(a))) >> 16))  // a + f(b-a)/65536; int64 product, since f * (b - a) can overflow int for 16-bit samples.
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {  // 16-bit horizontal scale with linear blending of neighbours.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ int xi = x >> 16;  // Integer source column.
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];  // Right-hand neighbour; reads one past xi.
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);  // Low 16 bits of x are the blend fraction.
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x32, int dx) {  // As ScaleFilterCols_16_C, with the position kept in 64 bits.
+ int64 x = (int64)(x32);  // Widened so repeated x += dx is done in 64-bit arithmetic.
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {  // Two output pixels per iteration.
+ int64 xi = x >> 16;  // Integer source column.
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];  // Right-hand neighbour; reads one past xi.
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);  // Low 16 bits of x are the blend fraction.
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {  // 3/8 width by point sampling; src_stride is unused.
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {  // Every 8 source pixels yield 3 outputs.
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {  // 16-bit 3/8 width by point sampling; src_stride is unused.
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {  // Every 8 source pixels yield 3 outputs.
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {  // Averages three rows: two 3x3 boxes and one 2x3 box per 8 columns.
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;  // Fixed-point divide by the 9 samples.
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;  // Last box is only 2 columns wide: 6 samples.
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {  // 16-bit 8x3 -> 3x1: two 3x3 boxes and one 2x3 box per 8 columns.
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536u / 9u) >> 16;  // Unsigned multiply: sum * 7281 can exceed INT_MAX for 16-bit samples.
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536u / 9u) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536u / 6u) >> 16;  // Last box is only 2 columns wide: 6 samples.
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {  // Averages two rows: two 3x2 boxes and one 2x2 box per 8 columns.
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;  // Fixed-point divide by the 6 samples.
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;  // Last box is only 2 columns wide: 4 samples.
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {  // 16-bit 8x2 -> 3x1: two 3x2 boxes and one 2x2 box per 8 columns.
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536u / 6u) >> 16;  // Unsigned multiply: sum * 10922 can exceed INT_MAX for 16-bit samples.
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536u / 6u) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536u / 4u) >> 16;  // Last box is only 2 columns wide: 4 samples.
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {  // Column sums over src_height rows, one uint16 per column.
+ int x;
+ assert(src_width > 0);
+ assert(src_height > 0);
+ for (x = 0; x < src_width; ++x) {
+ const uint8* s = src_ptr + x;  // Top of column x.
+ unsigned int sum = 0u;
+ int y;
+ for (y = 0; y < src_height; ++y) {
+ sum += s[0];
+ s += src_stride;  // Step down one row.
+ }
+ // TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
+ dst_ptr[x] = sum < 65535u ? sum : 65535u;  // Saturate to the uint16 range.
+ }
+}
+
+void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint32* dst_ptr, int src_width, int src_height) {  // Column sums over src_height rows, one uint32 per column.
+ int x;
+ assert(src_width > 0);
+ assert(src_height > 0);
+ for (x = 0; x < src_width; ++x) {
+ const uint16* s = src_ptr + x;  // Top of column x.
+ unsigned int sum = 0u;
+ int y;
+ for (y = 0; y < src_height; ++y) {
+ sum += s[0];
+ s += src_stride;  // Step down one row.
+ }
+ // No saturation is applied here: the sum is stored as computed.
+ dst_ptr[x] = sum;
+ }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {  // 1/2 width by point sampling; src_stride is unused.
+ const uint32* src = (const uint32*)(src_argb);  // Handle each pixel as one 32-bit word.
+ uint32* dst = (uint32*)(dst_argb);
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {  // Two output pixels per iteration.
+ dst[0] = src[1];  // Keeps the second pixel of each source pair.
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {  // Odd width: one remaining output pixel.
+ dst[0] = src[1];
+ }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {  // 1/2 width, averaging pixel pairs per channel; src_stride is unused.
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;  // Byte +4 is the same channel of the next pixel.
+ dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+ src_argb += 8;  // Advance two source pixels.
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {  // 1/2 width using a 2x2 box average per channel.
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;  // Rounded average of four samples.
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += 8;  // Advance two source pixels.
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width) {
+  // Copies every src_stepx-th 32-bit pixel of the source row.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int i;
+  for (i = 0; i < dst_width - 1; i += 2, out += 2) {
+    out[0] = in[0];
+    out[1] = in[src_stepx];
+    in += src_stepx * 2;
+  }
+  // Odd width: one trailing pixel.
+  if (dst_width & 1) {
+    *out = *in;
+  }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  // Averages a 2x2 block (with rounding) at every src_stepx-th source pixel.
+  const uint8* top = src_argb;
+  const uint8* bottom = src_argb + src_stride;
+  int i, c;
+  for (i = 0; i < dst_width; ++i) {
+    for (c = 0; c < 4; ++c) {
+      dst_argb[c] = (top[c] + top[c + 4] +
+                     bottom[c] + bottom[c + 4] + 2) >> 2;
+    }
+    top += src_stepx * 4;
+    bottom += src_stepx * 4;
+    dst_argb += 4;
+  }
+}
+
+// Point-samples one ARGB row; x and dx are 16.16 fixed point source steps.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx) {
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int pairs;
+  for (pairs = 0; pairs < dst_width - 1; pairs += 2) {
+    *out++ = in[x >> 16];
+    x += dx;
+    *out++ = in[x >> 16];
+    x += dx;
+  }
+  // Odd width: one trailing pixel.
+  if (dst_width & 1) {
+    *out = in[x >> 16];
+  }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x32, int dx) {
+  // Point sampling with the 16.16 source position tracked in 64 bits.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int64 pos = (int64)(x32);
+  int pairs;
+  for (pairs = 0; pairs < dst_width - 1; pairs += 2) {
+    *out++ = in[pos >> 16];
+    pos += dx;
+    *out++ = in[pos >> 16];
+    pos += dx;
+  }
+  if (dst_width & 1) {
+    *out = in[pos >> 16];
+  }
+}
+
+// 2x horizontal upscale: every source pixel is written twice; x, dx unused.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2, ++in, out += 2) {
+    const uint32 pixel = *in;
+    out[0] = pixel;
+    out[1] = pixel;
+  }
+  if (dst_width & 1) {
+    *out = *in;
+  }
+}
+
+// Mimics SSSE3 blender: 7-bit fraction f weights b, (0x7f ^ f) weights a.
+#define BLENDER1(a, b, f) (((a) * (0x7f ^ (f)) + (b) * (f)) >> 7)
+#define BLENDERC(a, b, f, s) (uint32)( \
+    BLENDER1(((a) >> (s)) & 255, ((b) >> (s)) & 255, f) << (s))
+#define BLENDER(a, b, f) \
+    (BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+     BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0))
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  // Blends each sampled pixel with its right neighbour by a 7-bit fraction.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  uint32 left, right;
+  int idx, frac, j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    idx = x >> 16;
+    frac = (x >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[0] = BLENDER(left, right, frac);
+    x += dx;
+    idx = x >> 16;
+    frac = (x >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[1] = BLENDER(left, right, frac);
+    x += dx;
+    out += 2;
+  }
+  if (dst_width & 1) {
+    idx = x >> 16;
+    frac = (x >> 9) & 0x7f;
+    out[0] = BLENDER(in[idx], in[idx + 1], frac);
+  }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x32, int dx) {
+  // Neighbour blend by a 7-bit fraction, with the position kept in 64 bits.
+  const uint32* in = (const uint32*)(src_argb);
+  uint32* out = (uint32*)(dst_argb);
+  int64 pos = (int64)(x32);
+  int64 idx;
+  uint32 left, right;
+  int frac, j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    idx = pos >> 16;
+    frac = (pos >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[0] = BLENDER(left, right, frac);
+    pos += dx;
+    idx = pos >> 16;
+    frac = (pos >> 9) & 0x7f;
+    left = in[idx];
+    right = in[idx + 1];
+    out[1] = BLENDER(left, right, frac);
+    pos += dx;
+    out += 2;
+  }
+  if (dst_width & 1) {
+    frac = (pos >> 9) & 0x7f;
+    out[0] = BLENDER(in[pos >> 16], in[(pos >> 16) + 1], frac);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation; x, y, dy are 16.16.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ int dst_width_bytes = dst_width * bpp; // bytes written per output row
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; // y clamp
+ int j;
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * bpp; // skip to the starting column
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { // overrides SSE2 pick
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { // overrides SSE picks
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) { // one output row per pass
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16; // integer source row
+ yf = filtering ? ((y >> 8) & 255) : 0; // 8-bit row fraction, 0 if unfiltered
+ InterpolateRow(dst_argb, src_argb + yi * src_stride,
+ src_stride, dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+// 16-bit version of ScalePlaneVertical; widths are in uint16 elements.
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;  // uint16 elements per output row.
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;  // Skip to the starting column.
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width_words >= 32) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_words >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi, yf;
+    if (y > max_y) {
+      y = max_y;  // Never read past the last source row.
+    }
+    yi = y >> 16;  // Integer source row.
+    yf = filtering ? ((y >> 8) & 255) : 0;  // 8-bit fraction, 0 if unfiltered.
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Reduces the requested filter to the simplest mode suited to the scale:
+//   Box      -> Bilinear if both axes scale to half size or larger, or if
+//               either axis is not reduced at all.
+//   Bilinear -> Linear if the source is one row tall, or the height is
+//               unchanged or exactly one third.
+//   Bilinear -> None if the source is one pixel wide.
+//   Linear   -> None if the source is one pixel wide, or the width is
+//               unchanged or exactly one third.
+// Negative source dimensions are judged by their magnitude.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  // Work with magnitudes so negative source sizes compare correctly.
+  const int w = (src_width < 0) ? -src_width : src_width;
+  const int h = (src_height < 0) ? -src_height : src_height;
+  if (filtering == kFilterBox) {
+    const int at_least_half = dst_width * 2 >= w && dst_height * 2 >= h;
+    const int not_reduced = dst_width >= w || dst_height >= h;
+    if (at_least_half || not_reduced) {
+      filtering = kFilterBilinear;
+    }
+  }
+  if (filtering == kFilterBilinear) {
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (h == 1 ||
+        dst_height == h ||
+        dst_height * 3 == h) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (w == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (w == 1 ||
+        dst_width == w ||
+        dst_width * 3 == w) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Returns num / div as a 16.16 fixed point value (64-bit intermediate).
+int FixedDiv_C(int num, int div) {
+  return (int)((int64)(num) * 65536 / div);
+}
+
+// Returns ((num << 16) - 0x00010001) / (div - 1); div must not be 1.
+int FixedDiv1_C(int num, int div) {
+  const int64 numerator = (int64)(num) * 65536 - 0x00010001;
+  return (int)(numerator / (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute 16.16 start positions (*x, *y) and steps (*dx, *dy) for scaling.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ int* x, int* y, int* dx, int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Check for 1 pixel and avoid FixedDiv overflow.
+ if (dst_width == 1 && src_width >= 32768) {
+ dst_width = src_width; // makes the step below come out as 1.0
+ }
+ if (dst_height == 1 && src_height >= 32768) {
+ dst_height = src_height; // makes the step below come out as 1.0
+ }
+ if (filtering == kFilterBox) {
+ // Box filter: plain src / dst step, sampling starts at the origin.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) { // upsample; > 1 keeps FixedDiv1's divisor nonzero
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) { // upsample; > 1 keeps FixedDiv1's divisor nonzero
+ *dy = FixedDiv1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Linear: horizontal step as for bilinear; vertical start is half a step.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0); // start half a step in
+ *y = CENTERSTART(*dy, 0); // start half a step in
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx; // begin at the far end of the row
+ *dx = -*dx; // and step backwards
+ // src_width = -src_width; // Caller must do this.
+ }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc
new file mode 100644
index 0000000..5722dea
--- /dev/null
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -0,0 +1,653 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+ // Keeps source bytes 0, 2, 4, ...: 16 outputs per pass, then a byte loop.
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
+ "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
+ "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
+ "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 0(%[src_ptr]) \n" // one output byte per pass
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+ // 2x2 box filter with rounding; t walks the second source row.
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "beqz $t9, 2f \n" // fewer than 8 outputs: skip loop 1
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |23|22|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t6+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n" // unaligned 4-byte load, row 1
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n" // unaligned 4-byte load, row 2
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n" // NOTE(review): odd residue stores 1 extra byte
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst), [t] "+r" (t)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9", "memory"
+ );
+}
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ // Keeps every 4th source byte: 8 outputs per pass, then a byte loop.
+ "srl $t9, %[dst_width], 3 \n" // iterations -> by 8
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n" // branch delay slot
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 0(%[src_ptr]) \n" // one output byte per pass
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n" // branch delay slot
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+ // 4x4 box filter with rounding over rows src_ptr, s1, s2 and s3.
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n" // iterations -> by 2
+ "andi $t8, %[dst_width], 1 \n" // odd trailing output
+ // NOTE(review): loop 1 has no entry guard; dst_width < 2 still runs one pass.
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n" // sum of first 4x4 block
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n" // sum of second 4x4 block
+ "shra_r.w $t0, $t0, 4 \n" // (sum + 8) >> 4
+ "shra_r.w $t4, $t4, 4 \n" // (sum + 8) >> 4
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n" // branch delay slot
+ "beqz $t8, 2f \n" // even width: done
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "2: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [s3] "+r" (s3)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+ "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|29|
+ "addiu %[dst_width], %[dst_width], -24 \n" // 24 outputs per pass
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n" // 32 source bytes per pass
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ "bnez %[dst_width], 1b \n" // exits only at exactly 0; assumes width % 24 == 0 - TODO confirm
+ " addiu %[dst], %[dst], 24 \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+ // 3 outputs per 4 source bytes, from row S and row T = S + src_stride.
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1
+ "raddu.w.qb $t1, $t1 \n" // T2+T1
+ "shra_r.w $t0, $t0, 1 \n" // (S2+S1+1)>>1
+ "shra_r.w $t1, $t1, 1 \n" // (T2+T1+1)>>1
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n" // |S1+S0*3|S2+S3*3|
+ "addu.ph $t6, $t6, $t5 \n" // |T1+T0*3|T2+T3*3|
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n" // middle S value * 3
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n" // outer S values * 3
+ "addu $t0, $t0, $t1 \n" // middle: 3*S + T
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n" // outer: 3*S + T
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n" // branch delay slot
+ "3: \n" // label is not referenced
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+ // 3 outputs per 4 source bytes; rows S and T = S + src_stride weigh equally.
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1
+ "raddu.w.qb $t1, $t1 \n" // T2+T1
+ "shra_r.w $t0, $t0, 1 \n" // (S2+S1+1)>>1
+ "shra_r.w $t1, $t1, 1 \n" // (T2+T1+1)>>1
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n" // |S1+S0*3|S2+S3*3|
+ "addu.ph $t6, $t6, $t5 \n" // |T1+T0*3|T2+T3*3|
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n" // outer: S + T
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n"
+ "addu $t0, $t0, $t1 \n" // middle: S + T
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n" // branch delay slot
+ "3: \n" // label is not referenced
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ // Point sampling: 12 output bytes picked from every 32 source bytes.
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n" // loop again only if >= 12 outputs remain
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
+ const int c = 0x2AAA;
+ // c is about 65536 / 6, so (sum * c) >> 16 averages a sum of 6 samples.
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ // 3 outputs per 8 source bytes, taken from rows src_ptr and t.
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n" // 6-sample sum
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n" // 6-sample sum
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n" // stores below use negative offsets
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [t] "+r" (t),
+ [dst_width] "+r" (dst_width)
+ : [c] "r" (c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71;
+ const int c2 = 0x2AAA;
+ // c1 ~ 65536 / 9 and c2 ~ 65536 / 6: (sum * c) >> 16 averages 9 or 6 samples.
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ // 3 outputs per 8 source bytes, taken from rows src_ptr (S), s1 (T), s2 (R).
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n" // 6-sample sum
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n" // 9-sample sum
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0
+ "raddu.w.qb $t4, $t4 \n" // R2+R1+R0
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n" // 9-sample sum
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n" // stores below use negative offsets
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n" // branch delay slot
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [dst_width] "+r" (dst_width)
+ : [c1] "r" (c1), [c2] "r" (c2)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc
new file mode 100644
index 0000000..704cfd2
--- /dev/null
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -0,0 +1,684 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away even pixels, and write 16x1 (src_stride is unused).
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc" // vst1 writes memory, subs sets the flags
+ );
+}
+
+// Read 32x2, average each 2x2 block (with rounding), and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc" // vst1 writes memory, subs sets flags
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Point-sample by 4: 32 source bytes -> 8 output bytes.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0, de-interleaved by 4
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n" // keep source bytes 2, 6, 10, ...
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // 4x4 box filter: 16 source bytes x 4 rows -> 4 output bytes.
+ "add r4, %0, %3 \n" // r4 = row 1 pointer
+ "add r5, r4, %3 \n" // r5 = row 2 pointer
+ "add %3, r5, %3 \n" // %3 = row 3 pointer (overwrites the stride)
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [r4]! \n"
+ "vld1.8 {q2}, [r5]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3 is written by the asm, so it must be read-write
+ : : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" // no pure inputs
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels (src_stride is unused).
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n" // 24 processed per loop
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // interleave 3 x 8 output bytes
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // 32x2 -> 24x1: rows weighted 3:1, then 4 -> 3 horizontally.
+ "vmov.u8 d24, #3 \n" // d24 = filter weight 3
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n" // 24 processed per loop
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2, with rounding
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // interleave a0, a1, a2
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // 32x2 -> 24x1: rows averaged 1:1, then 4 -> 3 horizontally.
+ "vmov.u8 d24, #3 \n" // d24 = filter weight 3
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n" // 24 processed per loop
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n" // interleave a0, a1, a2
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "d24", "memory", "cc" // r4 is never touched here
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 = // vtbl indices: select 12 of every 32 source bytes
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 = // vtbl indices used by the 3/8 box filters
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 = // used with vqrdmulh (doubling), so /12 yields /6
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 = // used with vqrdmulh (doubling), so /18 yields /9
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12: point-sample using the kShuf38 table (src_stride is unused).
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.8 {q3}, [%3] \n" // q3 (d6, d7) = shuffle indices
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // 32 source bytes
+ "subs %2, %2, #12 \n" // 12 processed per loop
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n" // first 8 output bytes
+ "vst1.32 {d5[0]}, [%1]! \n" // last 4 output bytes
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1: box filter over 3 rows, 12 output bytes per loop.
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.16 {q13}, [%4] \n" // q13 = kMult38_Div6
+ "vld1.8 {q14}, [%5] \n" // q14 = kShuf38_2
+ "vld1.8 {q15}, [%6] \n" // q15 = kMult38_Div9
+ "add r4, %0, %3, lsl #1 \n" // r4 = pointer to src line 2
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
+ "subs %2, %2, #12 \n" // 12 processed per loop
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d1 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n" // divide the 3x3 sums by 9
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n" // first 8 output bytes
+ "vst1.32 {d4[0]}, [%1]! \n" // last 4 output bytes
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2), // %5
+ "r"(&kMult38_Div9) // %6
+ : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q13", "q14", "q15", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1: box filter over 2 rows, 12 output bytes per loop.
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.16 {q13}, [%4] \n" // q13 = kMult38_Div6
+ "vld1.8 {q14}, [%5] \n" // q14 = kShuf38_2
+ "add %3, %0 \n" // %3 = pointer to src line 1
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n" // 12 processed per loop
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d1 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q1 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d2 = xx 20 xx 30 xx 22 xx 32
+ // d3 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d2 = xx 20 xx 21 xx 22 xx 23
+ // d3 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n" // divide the 3x2 sums by 6
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n" // first 8 output bytes
+ "vst1.32 {d4[0]}, [%1]! \n" // last 4 output bytes
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+// 16x2 -> 16x1: blend two rows; source_y_fraction / 256 is the row 1 weight.
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n" // fraction 0: copy row 0
+ "add %2, %1 \n" // %2 = pointer to row 1
+ "cmp %4, #64 \n"
+ "beq 75f \n" // fraction 64: 75% row 0, 25% row 1
+ "cmp %4, #128 \n"
+ "beq 50f \n" // fraction 128: plain average
+ "cmp %4, #192 \n"
+ "beq 25f \n" // fraction 192: 25% row 0, 75% row 1
+
+ "vdup.8 d5, %4 \n" // d5 = fraction (row 1 weight)
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n" // d4 = 256 - fraction (row 0 weight)
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n" // divide by 256 w/rounding
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n" // average, then average with row 1 again
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n" // rows swapped: q1 = row 0, q0 = row 1
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n" // repeat the last output byte past the row end
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile ( // Point-sample 32-bit pixels by 2; src_stride is unused.
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.32 {q0, q1}, [%0]! \n"
+ "vld2.32 {q2, q3}, [%0]! \n" // next 8 pixels: even in q2, odd in q3
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile ( // 2x2 box filter: 16 ARGB pixels x 2 rows -> 8 ARGB pixels.
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B: add row 2 pairs to row 1 sums.
+ "vpadal.u8 q1, q9 \n" // G: add row 2 pairs to row 1 sums.
+ "vpadal.u8 q2, q10 \n" // R: add row 2 pairs to row 1 sums.
+ "vpadal.u8 q3, q11 \n" // A: add row 2 pairs to row 1 sums.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // re-interleave the 4 channels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+
+// Reads 4 pixels at a time, one every src_stepx pixels (src_stride is unused).
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n" // r12 = src_stepx * 4 = step in bytes
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n" // one 32-bit pixel per step
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+
+// Writes 4 pixels per loop, each the rounded average of a 2x2 source block.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n" // r12 = src_stepx * 4 = step in bytes
+ "add %1, %1, %0 \n" // %1 = pointer to the second row
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n" // add the two rows, widening to 16 bits
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc
new file mode 100644
index 0000000..18b0810
--- /dev/null
+++ b/third_party/libyuv/source/scale_posix.cc
@@ -0,0 +1,1315 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant added before the >> 2 in the 3/4 box filters
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Point-sample by 2 (aligned loads/stores); src_stride unused.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // keep the odd byte of each pair
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 words -> 16 bytes
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Average each horizontal pixel pair; src_stride is unused.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word
+
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n" // odd bytes
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n" // even bytes
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // rounded average of even and odd
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" // xmm2/xmm3 are scratch copies
+#endif
+ );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Average two rows, then each horizontal pixel pair.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word
+
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // average the two rows
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n" // odd bytes
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n" // even bytes
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // rounded average of even and odd
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Point-sample by 2 with unaligned (movdqu) loads and stores.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n" // keep the odd byte of each pair
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n" // 16 words -> 16 bytes
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Average each horizontal pixel pair; unaligned loads/stores.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n" // odd bytes
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n" // even bytes
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // rounded average of even and odd
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" // xmm2/xmm3 are scratch copies
+#endif
+ );
+}
+
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Average two rows, then pixel pairs; unaligned loads/stores.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n" // xmm5 = 0x00ff in every word
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n" // average the two rows
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n" // odd bytes
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n" // even bytes
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // rounded average of even and odd
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n" // 16 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // Point-sample by 4: keep byte 2 of every 4; src_stride unused.
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n" // xmm5 = 0x00ff0000 in every dword
+
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n" // isolate byte 2 of each dword
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n" // 8 result bytes in the low half
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n" // 8 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stridex3 = 0; // set inside the asm; presumably src_stride * 3
+ asm volatile ( // 4x4 box filter built from successive rounded averages.
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n" // xmm7 = 0x00ff in every word
+ "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "pavgb %%xmm2,%%xmm0 \n" // average rows 0 and 1
+ "pavgb %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
+ MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
+ MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n" // average rows 2 and 3
+ "pavgb %%xmm2,%%xmm0 \n" // average the two row pairs
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // first horizontal pair average
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n" // second horizontal pair average
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n" // 8 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+ );
+}
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // NOTE(review): xmm3-5 are not declared clobbered here and
+ "movdqa %0,%%xmm3 \n" // kShuf0 // must survive until the next asm.
+ "movdqa %1,%%xmm4 \n" // kShuf1
+ "movdqa %2,%%xmm5 \n" // kShuf2
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile ( // Point-sample 32 source bytes down to 24 output bytes.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n" // xmm1 = source bytes 8..23
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n" // 24 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile ( // NOTE(review): registers set by the two setup asm blocks are
+ "movdqa %0,%%xmm2 \n" // kShuf01 // not declared clobbered and must
+ "movdqa %1,%%xmm3 \n" // kShuf11 // survive until the main loop.
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile ( // 32x2 -> 24x1: average the rows, then weighted 4 -> 3 filter.
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n" // average the two rows
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n" // weighted sum of each byte pair
+ "paddsw %%xmm1,%%xmm6 \n" // add rounding
+ "psrlw $0x2,%%xmm6 \n" // divide by 4
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n" // kMadd21, read from memory
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n" // 24 processed per loop
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Stage 1: load the three pshufb tables (no clobbers declared).
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (  // Stage 2: load pmaddubsw weights and the rounding constant.
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );  // NOTE(review): the loop below relies on xmm0-xmm5 surviving between asm statements.
+
+  asm volatile (  // Main loop: 32 bytes from each of two rows in, 24 bytes out.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"  // row 0, bytes 0..15
+    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"  // xmm7 = avg(row 0, row 1)
+    "pavgb     %%xmm7,%%xmm6                   \n"  // xmm6 = avg(row 0, xmm7): row 0 weighted more
+    "pshufb    %%xmm2,%%xmm6                   \n"  // kShuf01: pair up neighbours
+    "pmaddubsw %%xmm5,%%xmm6                   \n"  // kMadd01: weighted pair sums
+    "paddsw    %%xmm1,%%xmm6                   \n"  // + kRound34
+    "psrlw     $0x2,%%xmm6                     \n"  // >> 2
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"  // row 0, bytes 8..23 (unaligned)
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"  // kShuf11
+    "pmaddubsw %%xmm0,%%xmm6                   \n"  // kMadd11
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..15
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"  // row 0, bytes 16..31
+    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"  // kShuf21
+    "pmaddubsw %4,%%xmm6                       \n"  // kMadd21, read from memory
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"  // output bytes 16..23
+    "lea       " MEMLEA(0x18,1) ",%1           \n"  // dst_ptr += 24
+    "sub       $0x18,%2                        \n"  // dst_width -= 24
+    "jg        1b                              \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "m"(kMadd21)     // %4
+    : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {  // src_stride is not used.
+  asm volatile (  // Point samples 32 source bytes down to 12 output bytes per loop.
+    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
+    "movdqa    %4,%%xmm5                       \n"  // kShuf38b
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // source bytes 0..15
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // source bytes 16..31
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_ptr += 32
+    "pshufb    %%xmm4,%%xmm0                   \n"  // picked bytes land in lanes 0..5
+    "pshufb    %%xmm5,%%xmm1                   \n"  // picked bytes land in lanes 6..11
+    "paddusb   %%xmm1,%%xmm0                   \n"  // merge; the other lanes are zero
+    "movq      %%xmm0," MEMACCESS(1) "         \n"  // output bytes 0..7
+    "movhlps   %%xmm0,%%xmm1                   \n"  // bring the high half down
+    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // output bytes 8..11
+    "lea       " MEMLEA(0xc,1) ",%1            \n"  // dst_ptr += 12
+    "sub       $0xc,%2                         \n"  // dst_width -= 12
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "m"(kShuf38a),   // %3
+    "m"(kShuf38b)    // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+      , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Preload shuffle and scale tables (no clobbers declared).
+    "movdqa    %0,%%xmm2                       \n"  // kShufAb0
+    "movdqa    %1,%%xmm3                       \n"  // kShufAb1
+    "movdqa    %2,%%xmm4                       \n"  // kShufAb2
+    "movdqa    %3,%%xmm5                       \n"  // kScaleAb2
+  :
+  : "m"(kShufAb0),   // %0
+    "m"(kShufAb1),   // %1
+    "m"(kShufAb2),   // %2
+    "m"(kScaleAb2)   // %3
+  );  // NOTE(review): the loop below relies on xmm2-xmm5 surviving between asm statements.
+  asm volatile (  // Main loop: 16 bytes from each of two rows in, 6 bytes out.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // row 0
+    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pshufb    %%xmm2,%%xmm1                   \n"  // first byte of each box, as words
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"  // second byte of each box
+    "paddusw   %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"  // third byte (zero for 2-wide boxes)
+    "paddusw   %%xmm0,%%xmm1                   \n"  // per-box sums
+    "pmulhuw   %%xmm5,%%xmm1                   \n"  // (sum * kScaleAb2) >> 16
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "sub       $0x6,%2                         \n"  // dst_width -= 6
+    "movd      %%xmm1," MEMACCESS(1) "         \n"  // output bytes 0..3
+    "psrlq     $0x10,%%xmm1                    \n"
+    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"  // output bytes 2..5
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // dst_ptr += 6
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (  // Preload shuffle/scale tables and a zero register (no clobbers declared).
+    "movdqa    %0,%%xmm2                       \n"  // kShufAc
+    "movdqa    %1,%%xmm3                       \n"  // kShufAc3
+    "movdqa    %2,%%xmm4                       \n"  // kScaleAc33
+    "pxor      %%xmm5,%%xmm5                   \n"  // zero, used to widen bytes
+  :
+  : "m"(kShufAc),      // %0
+    "m"(kShufAc3),     // %1
+    "m"(kScaleAc33)    // %2
+  );  // NOTE(review): the loop below relies on xmm2-xmm5 surviving between asm statements.
+  asm volatile (  // Main loop: 16 bytes from each of three rows in, 6 bytes out.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // row 0
+    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
+    "movhlps   %%xmm0,%%xmm1                   \n"  // high 8 bytes of row 0
+    "movhlps   %%xmm6,%%xmm7                   \n"  // high 8 bytes of row 1
+    "punpcklbw %%xmm5,%%xmm0                   \n"  // widen bytes to words
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // row 0 + row 1
+    "paddusw   %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
+    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"  // + row 2: column sums, low half
+    "paddusw   %%xmm7,%%xmm1                   \n"  // column sums, high half
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"  // shift by one word
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"  // sums of 3 adjacent columns
+    "pshufb    %%xmm2,%%xmm6                   \n"  // kShufAc: keep words 0,3,6
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "pshufb    %%xmm3,%%xmm7                   \n"  // kShufAc3: words 0,3,6 -> 3,4,5
+    "paddusw   %%xmm7,%%xmm6                   \n"  // six box sums in words 0..5
+    "pmulhuw   %%xmm4,%%xmm6                   \n"  // (sum * kScaleAc33) >> 16
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "sub       $0x6,%2                         \n"  // dst_width -= 6
+    "movd      %%xmm6," MEMACCESS(1) "         \n"  // output bytes 0..3
+    "psrlq     $0x10,%%xmm6                    \n"
+    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"  // output bytes 2..5
+    "lea       " MEMLEA(0x6,1) ",%1            \n"  // dst_ptr += 6
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  int tmp_height = 0;    // inner-loop row counter
+  intptr_t tmp_src = 0;  // start of the current 16-byte column group
+  asm volatile (  // Sums src_height rows into 16-bit totals, 16 columns per outer loop.
+    "pxor      %%xmm4,%%xmm4                   \n"  // zero, used to widen bytes
+    "subl      $0x1,%5                         \n"  // explicit size: %5 may be a memory operand
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // first row of this column group
+    "mov       %0,%3                           \n"  // remember the column start
+    "add       %6,%0                           \n"  // src_ptr += src_stride
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm4,%%xmm0                   \n"  // low 8 bytes -> words
+    "punpckhbw %%xmm4,%%xmm1                   \n"  // high 8 bytes -> words
+    "mov       %5,%2                           \n"  // rows left = src_height - 1
+    "test      %2,%2                           \n"
+    "je        3f                              \n"  // single row: nothing to add
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"  // next row
+    "add       %6,%0                           \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"
+    "punpckhbw %%xmm4,%%xmm3                   \n"
+    "paddusw   %%xmm2,%%xmm0                   \n"  // saturating 16-bit accumulate
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "sub       $0x1,%2                         \n"
+    "jg        2b                              \n"
+
+    LABELALIGN
+  "3:                                          \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // 8 sums
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // 8 more sums
+    "lea       " MEMLEA(0x10,3) ",%0           \n"  // src_ptr = column start + 16
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_ptr += 32 bytes (16 words)
+    "sub       $0x10,%4                        \n"  // src_width -= 16
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(tmp_height),  // %2
+    "+r"(tmp_src),     // %3
+    "+r"(src_width),   // %4
+    "+rm"(src_height)  // %5
+  : "rm"((intptr_t)(src_stride))  // %6
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+
+// Bilinear column filtering, 2 pixels per loop plus a 1 pixel tail. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;  // source indices and a scratch register
+  asm volatile (
+    "movd      %6,%%xmm2                       \n"  // x
+    "movd      %7,%%xmm3                       \n"  // dx
+    "movl      $0x04040000,%k2                 \n"  // pshufb control: bytes 0,0,4,4
+    "movd      %k2,%%xmm5                      \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f in every word
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = high 16 bits of x
+    "subl      $0x2,%5                         \n"
+    "jl        29f                             \n"  // fewer than 2 pixels
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = x, x + dx
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = 2*dx, 2*dx
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = high 16 bits of x + dx
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both positions
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"  // source bytes x0, x0 + 1
+    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
+    BUNDLEALIGN
+    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
+    "movd      %k2,%%xmm4                      \n"  // source bytes x1, x1 + 1
+    "pshufb    %%xmm5,%%xmm1                   \n"  // f0,f0,f1,f1
+    "punpcklwd %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm1                   \n"  // weights (0x7f ^ f, f)
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // blend each source pair
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %w2," MEMACCESS(0) "            \n"  // store 2 pixels
+    "lea       " MEMLEA(0x2,0) ",%0            \n"
+    "subl      $0x2,%5                         \n"  // explicit size: %5 may be a memory operand
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "addl      $0x1,%5                         \n"
+    "jl        99f                             \n"  // no odd pixel left
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %b2," MEMACCESS(0) "            \n"  // store the last pixel
+  "99:                                         \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+a"(temp_pixel),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1),          // %4
+    "+rm"(dst_width)   // %5
+  : "rm"(x),           // %6
+    "rm"(dx)           // %7
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+// Reads 16 pixels, duplicates each one and writes 32 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {  // x and dx are not used.
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"  // 16 source bytes
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_ptr += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // double each of bytes 0..7
+    "punpckhbw %%xmm1,%%xmm1                   \n"  // double each of bytes 8..15
+    "sub       $0x20,%2                        \n"  // dst_width -= 32
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_ptr += 32
+    "jg        1b                              \n"
+
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width)    // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {  // src_stride is not used.
+  asm volatile (  // Keeps the odd 4-byte pixels: 8 pixels in, 4 pixels out per loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0..3
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4..7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // dwords 1,3 of each -> pixels 1,3,5,7
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {  // src_stride is not used.
+  asm volatile (  // Averages horizontal pixel pairs: 8 pixels in, 4 pixels out per loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0..3
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // pixels 4..7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels 0,2,4,6
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels 1,3,5,7
+    "pavgb     %%xmm2,%%xmm0                   \n"  // per-byte average of each pair
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (  // Averages 2x2 pixel boxes: 8x2 pixels in, 4 pixels out per loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // row 0, pixels 0..3
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // row 0, pixels 4..7
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // odd pixels
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average horizontal pairs
+    "sub       $0x4,%2                         \n"  // dst_width -= 4
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+// Point samples every src_stepx-th pixel; writes 4 pixels per loop. src_stride is not used.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);  // scaled by 4 inside the asm
+  intptr_t src_stepx_x12 = 0;                     // set inside the asm
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // presumably %1 = src_stepx * 4 (bytes)
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // presumably %4 = %1 * 3 (stepx * 12)
+    LABELALIGN
+  "1:                                          \n"
+    "movd      " MEMACCESS(0) ",%%xmm0         \n"  // pixel at step 0
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // src_argb += 4 steps
+    "punpckldq %%xmm3,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm0                  \n"  // gather the 4 pixels
+    "sub       $0x4,%3                         \n"  // dst_width -= 4
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(src_stepx_x4),  // %1
+    "+r"(dst_argb),      // %2
+    "+r"(dst_width),     // %3
+    "+r"(src_stepx_x12)  // %4
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+// Blends four 2x2 to 4x1. The 2x2 boxes are src_stepx pixels apart.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride, int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);  // scaled by 4 inside the asm
+  intptr_t src_stepx_x12 = 0;                     // set inside the asm
+  intptr_t row1 = (intptr_t)(src_stride);         // becomes the second row pointer
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"  // presumably %1 = src_stepx * 4 (bytes)
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"  // presumably %4 = %1 * 3 (stepx * 12)
+    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"  // presumably row1 = src_argb + src_stride
+
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row 0: 2 pixels at step 0
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"  // src_argb += 4 steps
+    "movq      " MEMACCESS(5) ",%%xmm2         \n"  // row 1: same four pixel pairs
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"  // row1 += 4 steps
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // left pixel of each pair
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"  // right pixel of each pair
+    "pavgb     %%xmm2,%%xmm0                   \n"  // average horizontally
+    "subl      $0x4,%3                         \n"  // explicit size: %3 may be a memory operand
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
+    "jg        1b                              \n"
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+rm"(dst_width),     // %3
+    "+r"(src_stepx_x12),  // %4
+    "+r"(row1)            // %5
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;  // source pixel indices (high 16 bits of the positions)
+  asm volatile (  // Point samples 4-byte pixels: 4 per loop, then a 2 and a 1 pixel tail.
+    "movd      %5,%%xmm2                       \n"  // x
+    "movd      %6,%%xmm3                       \n"  // dx
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"  // x in all four lanes
+    "pshufd    $0x11,%%xmm3,%%xmm0             \n"  // 0, dx, 0, dx
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // 2*dx in lane 0
+    "pshufd    $0x5,%%xmm3,%%xmm0              \n"  // 0, 0, 2*dx, 2*dx
+    "paddd     %%xmm0,%%xmm2                   \n"  // x, x+dx, x+2*dx, x+3*dx
+    "paddd     %%xmm3,%%xmm3                   \n"  // 4*dx in lane 0
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // 4*dx in all four lanes
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // index of pixel 0
+    "pextrw    $0x3,%%xmm2,%k1                 \n"  // index of pixel 1
+    "cmp       $0x0,%4                         \n"
+    "jl        99f                             \n"  // negative width: nothing to do
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"  // fewer than 4 pixels
+
+    LABELALIGN
+  "40:                                         \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // index of pixel 2
+    "pextrw    $0x7,%%xmm2,%k1                 \n"  // index of pixel 3
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance all four positions
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
+    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
+    "pextrw    $0x1,%%xmm2,%k0                 \n"  // indices for the next group
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "punpckldq %%xmm4,%%xmm1                   \n"
+    "punpcklqdq %%xmm1,%%xmm0                  \n"  // gather the 4 pixels
+    "sub       $0x4,%4                         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"  // unaligned 16 byte store
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "test      $0x2,%4                         \n"  // 2 pixels left?
+    "je        29f                             \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"  // index for a possible last pixel
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x8,2) ",%2            \n"
+  "29:                                         \n"
+    "test      $0x1,%4                         \n"  // 1 pixel left?
+    "je        99f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+  "99:                                         \n"
+  : "+a"(x0),          // %0
+    "+d"(x1),          // %1
+    "+r"(dst_argb),    // %2
+    "+r"(src_argb),    // %3
+    "+r"(dst_width)    // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+
+// Reads 4 ARGB pixels, duplicates each one and writes 8 pixels. x and dx are not used.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"  // 4 source pixels
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb += 16
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpckldq %%xmm0,%%xmm0                   \n"  // pixels 0,0,1,1
+    "punpckhdq %%xmm1,%%xmm1                   \n"  // pixels 2,2,3,3
+    "sub       $0x8,%2                         \n"  // dst_width -= 8
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32
+    "jg        1b                              \n"
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+// pshufb table that interleaves the channels of 2 adjacent pixels for pmaddubsw.
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// pshufb table that copies byte 0 into lanes 0..7 and byte 4 into lanes 8..15.
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear column filtering for ARGB, 2 pixels per loop plus a 1 pixel tail. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;  // source pixel indices (high 16 bits of the positions)
+  asm volatile (  // Preload the pshufb tables (no clobbers declared).
+    "movdqa    %0,%%xmm4                       \n"  // kShuffleColARGB
+    "movdqa    %1,%%xmm5                       \n"  // kShuffleFractions
+  :
+  : "m"(kShuffleColARGB),  // %0
+    "m"(kShuffleFractions)  // %1
+  );  // NOTE(review): the code below relies on xmm4/xmm5 surviving between asm statements.
+
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"  // x
+    "movd      %6,%%xmm3                       \n"  // dx
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f in every word
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = high 16 bits of x
+    "subl      $0x2,%2                         \n"  // explicit size: %2 may be a memory operand
+    "jl        29f                             \n"  // fewer than 2 pixels
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"  // x + dx
+    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = x, x + dx
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = 2*dx, 2*dx
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = high 16 bits of x + dx
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"  // advance both positions
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm1                   \n"  // spread each fraction over 8 bytes
+    "pshufb    %%xmm4,%%xmm0                   \n"  // interleave the two source pixels
+    "pxor      %%xmm6,%%xmm1                   \n"  // weights (0x7f ^ f, f)
+    "pmaddubsw %%xmm1,%%xmm0                   \n"  // blend each channel pair
+    "psrlw     $0x7,%%xmm0                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"  // next x0
+    "pextrw    $0x3,%%xmm2,%k4                 \n"  // next x1
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(0) "         \n"  // store 2 pixels
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "subl      $0x2,%2                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "addl      $0x1,%2                         \n"
+    "jl        99f                             \n"  // no odd pixel left
+    "psrlw     $0x9,%%xmm2                     \n"
+    BUNDLEALIGN
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(0) "         \n"  // store the last pixel
+
+    LABELALIGN
+  "99:                                         \n"
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+rm"(dst_width),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1)           // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+// Divide num by div and return as 16.16 fixed point result: (num << 16) / div.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // sign-extend eax (num) into edx
+    "shld      $0x10,%%eax,%%edx               \n"  // edx:eax = num << 16 (high half)
+    "shl       $0x10,%%eax                     \n"  // edx:eax = num << 16 (low half)
+    "idiv      %1                              \n"  // eax = quotient, edx = remainder
+    "mov       %0, %%eax                       \n"  // %0 is bound to eax, so eax -> eax
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+// Returns ((num << 16) - 0x10001) / (div - 1): num - 1 over div - 1 in 16.16 fixed point.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"  // sign-extend eax (num) into edx
+    "shld      $0x10,%%eax,%%edx               \n"  // edx:eax = num << 16 (high half)
+    "shl       $0x10,%%eax                     \n"  // edx:eax = num << 16 (low half)
+    "sub       $0x10001,%%eax                  \n"  // 64-bit subtract of 0x10001 ...
+    "sbb       $0x0,%%edx                      \n"  // ... with borrow into the high half
+    "sub       $0x1,%1                         \n"  // div - 1
+    "idiv      %1                              \n"  // eax = quotient, edx = remainder
+    "mov       %0, %%eax                       \n"  // %0 is bound to eax, so eax -> eax
+    : "+a"(num), "+c"(div)  // %0, %1: div is decremented above, so it must be in/out
+    :
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc
new file mode 100644
index 0000000..bd5cca8
--- /dev/null
+++ b/third_party/libyuv/source/scale_win.cc
@@ -0,0 +1,1320 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding value added to each 16 bit sum before the shift right by 2.
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+// Picks bytes 0,3,6,8,11,14 into lanes 0 to 5; other lanes are zeroed.
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Picks bytes 0,3,6,8,11,14 into lanes 6 to 11; other lanes are zeroed.
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5 (pixels 2 and 5 have no third value)
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, keeps the odd ones and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]           // source pixels 0..15
+    movdqa     xmm1, [eax + 16]      // source pixels 16..31
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1            // words back to 16 bytes
+    sub        ecx, 16               // dst_width -= 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1 by averaging horizontal pixel pairs.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd pixels
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even pixels
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2            // rounded average of each odd/even pair
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16               // dst_width -= 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1: averages the two rows, then horizontal pairs.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi is restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]     // second row
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd pixels
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even pixels
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16               // dst_width -= 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 32 pixels, keeps the odd ones and writes 16 pixels.
+// No alignment requirement: src_ptr and dst_ptr are accessed with movdqu.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]           // source pixels 0..15
+    movdqu     xmm1, [eax + 16]      // source pixels 16..31
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1            // words back to 16 bytes
+    sub        ecx, 16               // dst_width -= 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1 by averaging horizontal pixel pairs.
+// No alignment requirement: src_ptr and dst_ptr are accessed with movdqu.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd pixels
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even pixels
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2            // rounded average of each odd/even pair
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16               // dst_width -= 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1: averages the two rows, then horizontal pairs.
+// No alignment requirement: src_ptr and dst_ptr are accessed with movdqu.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                   // esi is restored before ret
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]     // second row
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8               // odd pixels
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5            // even pixels
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16               // dst_width -= 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Point samples 32 pixels to 8 pixels, keeping byte 2 of every 4 source bytes.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    psrld      xmm5, 24
+    pslld      xmm5, 16
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5            // keep byte 2 of each dword
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1            // kept byte is now the high byte of each word
+    psrlw      xmm0, 8               // move it to the low byte
+    packuswb   xmm0, xmm0            // 8 result bytes in the low half
+    sub        ecx, 8                // dst_width -= 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1: averages four rows, then halves the width twice.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi                     // esi and edi are restored before ret
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]             // row 0
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]       // row 1
+    movdqa     xmm3, [eax + esi + 16]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, [eax + esi * 2]   // row 2
+    movdqa     xmm3, [eax + esi * 2 + 16]
+    movdqa     xmm4, [eax + edi]       // row 3
+    movdqa     xmm5, [eax + edi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm2, xmm4              // average rows 2 and 3
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2              // average the two row averages
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm7
+    pand       xmm3, xmm7
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
+    psrlw      xmm0, 8
+    pand       xmm2, xmm7
+    pavgw      xmm0, xmm2
+    packuswb   xmm0, xmm0
+
+    sub        ecx, 8                  // dst_width -= 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// The middle 16 bytes are built with movdqa + palignr instead of a movdqu load.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm3, kShuf0
+    movdqa     xmm4, kShuf1
+    movdqa     xmm5, kShuf2
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]           // source bytes 0..15
+    movdqa     xmm1, [eax + 16]      // source bytes 16..31
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm1
+    palignr    xmm1, xmm0, 8         // source bytes 8..23
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0       // output bytes 0..7
+    movq       qword ptr [edx + 8], xmm1   // output bytes 8..15
+    movq       qword ptr [edx + 16], xmm2  // output bytes 16..23
+    lea        edx, [edx + 24]
+    sub        ecx, 24               // dst_width -= 24
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm0, xmm1 // equal-weight average of the two rows
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5 // weighted sum of adjacent byte pairs
+ paddsw xmm0, xmm7 // add rounding constant
+ psrlw xmm0, 2 // presumably the madd weights sum to 4 - tables not visible here
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21 // reloaded each iteration; xmm2-xmm7 hold the other constants
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24 // 24 output bytes per iteration; sets flags for jg
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm1, xmm0 // xmm1 = avg(row0, row1)
+ pavgb xmm0, xmm1 // xmm0 = avg(row0, avg(row0, row1)): row 0 weighted about 3:1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7 // add rounding constant
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21 // reloaded each iteration; xmm2-xmm7 hold the other constants
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24 // 24 output bytes per iteration; sets flags for jg
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a // shuffle masks are defined outside this function
+ movdqa xmm5, kShuf38b
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1 // merge halves; presumably unused lanes are zeroed by the masks
+
+ sub ecx, 12 // 12 output bytes per iteration; sets flags for jg
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0 // bring the upper 8 bytes down
+ movd [edx + 8], xmm1 // bytes 8..11
+ lea edx, [edx + 12]
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5 // zero, used to widen bytes to words
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqa xmm6, [eax + esi]
+ movhlps xmm1, xmm0 // upper 8 bytes of row 0
+ movhlps xmm7, xmm6 // upper 8 bytes of row 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqa xmm6, [eax + esi * 2] // row 2
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2 // shift by one word to add the neighbouring column
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ sub ecx, 6 // 6 output bytes per iteration; sets flags for jg
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6 // bytes 2..5; overlaps the first store by 2 bytes
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // average 2 rows into xmm0
+ pavgb xmm0, [eax + esi]
+ lea eax, [eax + 16]
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6 // accumulate the three shuffled copies as words
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ sub ecx, 6 // 6 output bytes per iteration; sets flags for jg
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1 // bytes 2..5; overlaps the first store by 2 bytes
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
+__declspec(naked) __declspec(align(16))
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov esi, [esp + 16 + 4] // src_ptr
+ mov edx, [esp + 16 + 8] // src_stride
+ mov edi, [esp + 16 + 12] // dst_ptr
+ mov ecx, [esp + 16 + 16] // src_width
+ mov ebx, [esp + 16 + 20] // src_height
+ pxor xmm4, xmm4 // zero, used to widen bytes to words
+ dec ebx // rows remaining after the first one
+
+ align 4
+ xloop:
+ // first row
+ movdqa xmm0, [esi]
+ lea eax, [esi + edx] // eax = second row of this 16 pixel column
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ lea esi, [esi + 16]
+ mov ebp, ebx // per-column row counter
+ test ebp, ebp
+ je ydone // only one row: nothing more to add
+
+ // sum remaining rows
+ align 4
+ yloop:
+ movdqa xmm2, [eax] // read 16 pixels
+ lea eax, [eax + edx] // advance to next row
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3 // unsigned saturating add
+ sub ebp, 1
+ jg yloop
+
+ align 4
+ ydone:
+ movdqa [edi], xmm0 // store 16 word sums (32 bytes)
+ movdqa [edi + 16], xmm1
+ lea edi, [edi + 32]
+
+ sub ecx, 16 // 16 source pixels per iteration
+ jg xloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Bilinear column filtering. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+// TODO(fbarchard): Switch the following:
+// xor ebx, ebx
+// mov bx, word ptr [esi + eax] // 2 source x0 pixels
+// To
+// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+// when drmemory bug fixed. NOTE(review): the code below already uses movzx.
+// https://code.google.com/p/drmemory/issues/detail?id=1396
+
+__declspec(naked) __declspec(align(16))
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29 // fewer than 2 pixels: go to the remainder check
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 4
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits, 2 pixels.
+ movd ebx, xmm0
+ mov [edi], bx // store 2 output pixels
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ align 4
+ xloop29:
+
+ add ecx, 2 - 1 // ecx >= 0 here means exactly one pixel is left
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // 16 bit
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits
+ movd ebx, xmm0
+ mov [edi], bl // store 1 output pixel
+
+ align 4
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width (x and dx are not read)
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // duplicate each of the low 8 bytes
+ punpckhbw xmm1, xmm1 // duplicate each of the high 8 bytes
+ sub ecx, 32 // 32 output bytes per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 of them (1, 3, 5, 7).
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..3
+ movdqa xmm1, [eax + 16] // pixels 4..7
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd // dwords 1 and 3 of xmm0, then 1 and 3 of xmm1
+ sub ecx, 4 // 4 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..3
+ movdqa xmm1, [eax + 16] // pixels 4..7
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2 // average each even pixel with its odd neighbour
+ sub ecx, 4 // 4 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // row 0, pixels 0..7 in xmm0/xmm1
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi] // row 1
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4 // 4 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4] // step in bytes (4 bytes are read per pixel)
+ lea edi, [ebx + ebx * 2] // step in bytes * 3
+
+ align 4
+ wloop:
+ movd xmm0, [eax] // pixel at offset 0
+ movd xmm1, [eax + ebx] // pixel one step further
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2] // pixel two steps further
+ movd xmm3, [eax + edi] // pixel three steps further
+ lea eax, [eax + ebx * 4] // advance source by four steps
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2 // pack the 4 sampled pixels into one register
+ sub ecx, 4 // 4 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4] // step in bytes (4 bytes per pixel)
+ lea edi, [ebx + ebx * 2] // step in bytes * 3
+
+ align 4
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4] // advance row0 by four steps
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4] // advance row1 by four steps
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4 // 4 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push edi
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99 // nothing to do for dst_width <= 0
+ sub ecx, 4
+ jl xloop49 // fewer than 4 pixels: skip the 4 pixel loop
+
+ // 4 Pixel loop.
+ align 4
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ sub ecx, 4 // 4 pixels
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ jge xloop4
+
+ align 4
+ xloop49:
+ test ecx, 2 // ecx = remaining - 4, so its low 2 bits equal those of remaining
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1 // odd remainder: one more pixel to copy
+ je xloop99
+
+ // 1 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels
+ movd dword ptr [edi], xmm0
+ align 4
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, kShuffleColARGB
+ movdqa xmm5, kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29 // fewer than 2 pixels: go to the remainder check
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 4
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0 // store 2 output pixels (8 bytes)
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ align 4
+ xloop29:
+
+ add ecx, 2 - 1 // ecx >= 0 here means exactly one pixel is left
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0 // store 1 output pixel (4 bytes)
+
+ align 4
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width (x and dx are not read)
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0 // duplicate pixels 0 and 1
+ punpckhdq xmm1, xmm1 // duplicate pixels 2 and 3
+ sub ecx, 8 // 8 output pixels per iteration; sets flags for jg
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16 // edx:eax now holds num << 16
+ idiv dword ptr [esp + 8] // signed divide by div; quotient is left in eax
+ ret
+ }
+}
+
+// Divides ((num << 16) - 0x00010001) by (div - 1); 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv1_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16 // edx:eax now holds num << 16
+ sub eax, 0x00010001 // 64 bit subtract: low half here, borrow below
+ sbb edx, 0
+ sub ecx, 1 // divisor is div - 1
+ idiv ecx // signed divide; quotient is left in eax
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 5b22b94..2134676 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -395,6 +395,23 @@
%assign n_arg_names %0
%endmacro
+%if ARCH_X86_64
+%macro ALLOC_STACK 2 ; stack_size, num_regs
+ %assign %%stack_aligment ((mmsize + 15) & ~15)
+ %assign stack_size_padded %1
+
+ %assign %%reg_num (%2 - 1)
+ %xdefine rsp_tmp r %+ %%reg_num
+ mov rsp_tmp, rsp ; keep the original rsp in register r(num_regs - 1)
+ sub rsp, stack_size_padded ; reserve stack_size bytes (no extra padding is added)
+ and rsp, ~(%%stack_aligment - 1) ; align rsp down to mmsize rounded up to 16
+%endmacro
+
+%macro RESTORE_STACK 0 ; reset rsp register
+ mov rsp, rsp_tmp ; rsp_tmp must still hold the value saved by ALLOC_STACK
+%endmacro
+%endif
+
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx, ecx, cx, cl
diff --git a/tools_common.h b/tools_common.h
index 2124882..e033de2 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -89,7 +89,7 @@
enum VideoFileType file_type;
uint32_t width;
uint32_t height;
- int use_i420;
+ vpx_img_fmt_t fmt;
int only_i420;
uint32_t fourcc;
struct VpxRational framerate;
diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c
index 3e37e08..1f8157f 100644
--- a/vp8/common/arm/dequantize_arm.c
+++ b/vp8/common/arm/dequantize_arm.c
@@ -12,26 +12,9 @@
#include "vpx_config.h"
#include "vp8/common/blockd.h"
-#if HAVE_NEON_ASM
-extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
-#endif
-
#if HAVE_MEDIA
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
-#endif
-#if HAVE_NEON_ASM
-
-void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
-{
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
-
- vp8_dequantize_b_loop_neon(Q, DQC, DQ);
-}
-#endif
-
-#if HAVE_MEDIA
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 5d693c6..f37ca63 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -25,20 +25,24 @@
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM || HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
+#endif
+#if HAVE_NEON_ASM
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
-extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
-
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_NEON
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
@@ -118,7 +122,7 @@
}
#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
@@ -146,7 +150,9 @@
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
+#endif
+#if HAVE_NEON_ASM
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/vp8/common/arm/neon/dequantizeb_neon.c b/vp8/common/arm/neon/dequantizeb_neon.c
index 60f69c8..54e709d 100644
--- a/vp8/common/arm/neon/dequantizeb_neon.c
+++ b/vp8/common/arm/neon/dequantizeb_neon.c
@@ -10,18 +10,16 @@
#include <arm_neon.h>
-void vp8_dequantize_b_loop_neon(
- int16_t *Q,
- int16_t *DQC,
- int16_t *DQ) {
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
int16x8x2_t qQ, qDQC, qDQ;
- qQ = vld2q_s16(Q);
+ qQ = vld2q_s16(d->qcoeff);
qDQC = vld2q_s16(DQC);
qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
- vst2q_s16(DQ, qDQ);
- return;
+ vst2q_s16(d->dqcoeff, qDQ);
}
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 0000000..3a39210
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,81 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq,
+; unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *dst
+; r3 stride
+|idct_dequant_0_2x_neon| PROC
+ push {r4, r5}
+ vpush {d8-d15} ; q4/q5 (d8-d11) are used below
+
+ add r12, r2, #4 ; r12 = dst + 4 (second 4 pixel wide block)
+ vld1.32 {d2[0]}, [r2], r3 ; lo block, 4 rows of 4 bytes into d2/d4
+ vld1.32 {d8[0]}, [r12], r3 ; hi block, 4 rows of 4 bytes into d8/d10
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d10[1]}, [r12], r3
+
+ ldrh r12, [r0] ; lo q
+ ldrh r4, [r0, #32] ; hi q
+ mov r5, #0
+ strh r5, [r0] ; clear both coefficients after reading them
+ strh r5, [r0, #32]
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1 ; q * dq
+ add r0, r0, #4 ; (q * dq + 4) >> 3
+ asr r0, r0, #3
+ vdup.16 q0, r0 ; broadcast the lo value to 8 lanes
+ sxth r4, r4 ; hi
+ mul r0, r4, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0 ; broadcast the hi value to 8 lanes
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r0, r2, #4 ; r0 = dst + 4 for the hi block stores
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d10[1]}, [r0]
+
+ vpop {d8-d15}
+ pop {r4, r5}
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
deleted file mode 100644
index 967c322..0000000
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-void idct_dequant_0_2x_neon(
- int16_t *q,
- int16_t dq,
- unsigned char *dst,
- int stride) {
- unsigned char *dst0;
- int i, a0, a1;
- int16x8x2_t q2Add;
- int32x2_t d2s32, d4s32;
- uint8x8_t d2u8, d4u8;
- uint16x8_t q1u16, q2u16;
-
- a0 = ((q[0] * dq) + 4) >> 3;
- a1 = ((q[16] * dq) + 4) >> 3;
- q[0] = q[16] = 0;
- q2Add.val[0] = vdupq_n_s16((int16_t)a0);
- q2Add.val[1] = vdupq_n_s16((int16_t)a1);
-
- for (i = 0; i < 2; i++, dst += 4) {
- dst0 = dst;
- d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
- dst0 += stride;
- d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
- dst0 += stride;
- d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
- dst0 += stride;
- d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
-
- q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
- vreinterpret_u8_s32(d2s32));
- q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
- vreinterpret_u8_s32(d4s32));
-
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
- d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
-
- d2s32 = vreinterpret_s32_u8(d2u8);
- d4s32 = vreinterpret_s32_u8(d4u8);
-
- dst0 = dst;
- vst1_lane_s32((int32_t *)dst0, d2s32, 0);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst0, d2s32, 1);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst0, d4s32, 0);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst0, d4s32, 1);
- }
- return;
-}
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 0000000..8da0fa0
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,199 @@
+;
+; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq,
+; unsigned char *dst, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *dst
+; r3 stride
+|idct_dequant_full_2x_neon| PROC
+ vpush {d8-d15}
+
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r3 ; l pre
+ vld1.32 {d28[1]}, [r12], r3 ; r pre
+ vld1.32 {d29[0]}, [r2], r3
+ vld1.32 {d29[1]}, [r12], r3
+ vld1.32 {d30[0]}, [r2], r3
+ vld1.32 {d30[1]}, [r12], r3
+ vld1.32 {d31[0]}, [r2], r3
+ vld1.32 {d31[1]}, [r12]
+
+ adr r1, cospi8sqrt2minus1 ; pointer to the first constant
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ vld1.16 {d0}, [r1]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. This was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16, so we need to
+ ; compensate for this. In the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can pre-shift the constant without losing precision. This avoids
+ ; shifting again afterward and also avoids the sign issue.
+ ; For cospi8sqrt2minus1 the lowest bit is 1, so pre-shifting would lose
+ ; precision; its products are halved after the multiply instead.
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r1, r2, #4 ; hi
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ vst1.32 {d0[0]}, [r2], r3 ; lo
+ vst1.32 {d0[1]}, [r1], r3 ; hi
+ vst1.32 {d1[0]}, [r2], r3
+ vst1.32 {d1[1]}, [r1], r3
+ vst1.32 {d2[0]}, [r2], r3
+ vst1.32 {d2[1]}, [r1], r3
+ vst1.32 {d3[0]}, [r2]
+ vst1.32 {d3[1]}, [r1]
+
+ vpop {d8-d15}
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool (each DCD emits a 32-bit word; the code reads them as 16-bit lanes d0[0] and d0[2])
+cospi8sqrt2minus1 DCD 0x4e7b
+; sinpi8sqrt2 is 0x8a8c; because its lowest bit is 0, we can pre-shift it right by one (0x4546)
+sinpi8sqrt2 DCD 0x4546
+
+ END
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
deleted file mode 100644
index a60ed46..0000000
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 17734;
-// because the lowest bit in 0x8a8c is 0, we can pre-shift this
-
-void idct_dequant_full_2x_neon(
- int16_t *q,
- int16_t *dq,
- unsigned char *dst,
- int stride) {
- unsigned char *dst0, *dst1;
- int32x2_t d28, d29, d30, d31;
- int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
- int16x8_t qEmpty = vdupq_n_s16(0);
- int32x4x2_t q2tmp0, q2tmp1;
- int16x8x2_t q2tmp2, q2tmp3;
- int16x4_t dLow0, dLow1, dHigh0, dHigh1;
-
- d28 = d29 = d30 = d31 = vdup_n_s32(0);
-
- // load dq
- q0 = vld1q_s16(dq);
- dq += 8;
- q1 = vld1q_s16(dq);
-
- // load q
- q2 = vld1q_s16(q);
- vst1q_s16(q, qEmpty);
- q += 8;
- q3 = vld1q_s16(q);
- vst1q_s16(q, qEmpty);
- q += 8;
- q4 = vld1q_s16(q);
- vst1q_s16(q, qEmpty);
- q += 8;
- q5 = vld1q_s16(q);
- vst1q_s16(q, qEmpty);
-
- // load src from dst
- dst0 = dst;
- dst1 = dst + 4;
- d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
- dst0 += stride;
- d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
- dst1 += stride;
- d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
- dst0 += stride;
- d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
- dst1 += stride;
-
- d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
- dst0 += stride;
- d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
- dst1 += stride;
- d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
- d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
-
- q2 = vmulq_s16(q2, q0);
- q3 = vmulq_s16(q3, q1);
- q4 = vmulq_s16(q4, q0);
- q5 = vmulq_s16(q5, q1);
-
- // vswp
- dLow0 = vget_low_s16(q2);
- dHigh0 = vget_high_s16(q2);
- dLow1 = vget_low_s16(q4);
- dHigh1 = vget_high_s16(q4);
- q2 = vcombine_s16(dLow0, dLow1);
- q4 = vcombine_s16(dHigh0, dHigh1);
-
- dLow0 = vget_low_s16(q3);
- dHigh0 = vget_high_s16(q3);
- dLow1 = vget_low_s16(q5);
- dHigh1 = vget_high_s16(q5);
- q3 = vcombine_s16(dLow0, dLow1);
- q5 = vcombine_s16(dHigh0, dHigh1);
-
- q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
- q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
- q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
- q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
-
- q10 = vqaddq_s16(q2, q3);
- q11 = vqsubq_s16(q2, q3);
-
- q8 = vshrq_n_s16(q8, 1);
- q9 = vshrq_n_s16(q9, 1);
-
- q4 = vqaddq_s16(q4, q8);
- q5 = vqaddq_s16(q5, q9);
-
- q2 = vqsubq_s16(q6, q5);
- q3 = vqaddq_s16(q7, q4);
-
- q4 = vqaddq_s16(q10, q3);
- q5 = vqaddq_s16(q11, q2);
- q6 = vqsubq_s16(q11, q2);
- q7 = vqsubq_s16(q10, q3);
-
- q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
- q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
- q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
- vreinterpretq_s16_s32(q2tmp1.val[0]));
- q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
- vreinterpretq_s16_s32(q2tmp1.val[1]));
-
- // loop 2
- q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
- q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
- q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
- q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
-
- q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
- q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
-
- q10 = vshrq_n_s16(q10, 1);
- q11 = vshrq_n_s16(q11, 1);
-
- q10 = vqaddq_s16(q2tmp2.val[1], q10);
- q11 = vqaddq_s16(q2tmp3.val[1], q11);
-
- q8 = vqsubq_s16(q8, q11);
- q9 = vqaddq_s16(q9, q10);
-
- q4 = vqaddq_s16(q2, q9);
- q5 = vqaddq_s16(q3, q8);
- q6 = vqsubq_s16(q3, q8);
- q7 = vqsubq_s16(q2, q9);
-
- q4 = vrshrq_n_s16(q4, 3);
- q5 = vrshrq_n_s16(q5, 3);
- q6 = vrshrq_n_s16(q6, 3);
- q7 = vrshrq_n_s16(q7, 3);
-
- q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
- q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
- q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
- vreinterpretq_s16_s32(q2tmp1.val[0]));
- q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
- vreinterpretq_s16_s32(q2tmp1.val[1]));
-
- q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
- vreinterpret_u8_s32(d28)));
- q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
- vreinterpret_u8_s32(d29)));
- q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
- vreinterpret_u8_s32(d30)));
- q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
- vreinterpret_u8_s32(d31)));
-
- d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
- d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
- d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
- d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
-
- dst0 = dst;
- dst1 = dst + 4;
- vst1_lane_s32((int32_t *)dst0, d28, 0);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst1, d28, 1);
- dst1 += stride;
- vst1_lane_s32((int32_t *)dst0, d29, 0);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst1, d29, 1);
- dst1 += stride;
-
- vst1_lane_s32((int32_t *)dst0, d30, 0);
- dst0 += stride;
- vst1_lane_s32((int32_t *)dst1, d30, 1);
- dst1 += stride;
- vst1_lane_s32((int32_t *)dst0, d31, 0);
- vst1_lane_s32((int32_t *)dst1, d31, 1);
- return;
-}
diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
index 765fc3a..e55a33c 100644
--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c
@@ -14,7 +14,7 @@
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
-#if HAVE_NEON_ARM
+#if HAVE_NEON_ASM
extern void vp8_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index dcf5b8e..cbfd76a 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -29,9 +29,8 @@
# Dequant
#
add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx media neon_asm/;
+specialize qw/vp8_dequantize_b mmx media neon/;
$vp8_dequantize_b_media=vp8_dequantize_b_v6;
-$vp8_dequantize_b_neon_asm=vp8_dequantize_b_neon;
add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
@@ -54,9 +53,8 @@
# Loopfilter
#
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
-$vp8_loop_filter_mbv_neon_asm=vp8_loop_filter_mbv_neon;
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -66,9 +64,8 @@
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
-$vp8_loop_filter_mbh_neon_asm=vp8_loop_filter_mbh_neon;
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -87,12 +84,12 @@
$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
-$vp8_loop_filter_simple_mbh_neon_asm=vp8_loop_filter_mbhs_neon;
+$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
@@ -103,12 +100,12 @@
$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
-$vp8_loop_filter_simple_bh_neon_asm=vp8_loop_filter_bhs_neon;
+$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
#
# IDCT
@@ -555,7 +552,7 @@
# Denoiser filter
#
if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
- add_proto qw/int vp8_denoiser_filter/, "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset";
+ add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
specialize qw/vp8_denoiser_filter sse2 neon/;
}
diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm
index d9120d0..7d5e681 100644
--- a/vp8/common/x86/variance_impl_mmx.asm
+++ b/vp8/common/x86/variance_impl_mmx.asm
@@ -342,8 +342,8 @@
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm0, [rax] ; Copy four bytes to mm0
+ movd mm1, [rbx] ; Copy four bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -351,12 +351,12 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -365,12 +365,12 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher prrcision
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
@@ -378,11 +378,11 @@
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
+ movd mm1, [rbx] ; Copy four bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
+ movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index 23dc0a9..32ce65a 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -45,10 +45,13 @@
* [16, 255] 3 6 7
*/
-int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
- YV12_BUFFER_CONFIG *running_avg,
- MACROBLOCK *signal, unsigned int motion_magnitude,
- int y_offset, int uv_offset) {
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
+ int mc_running_avg_y_stride,
+ unsigned char *running_avg_y,
+ int running_avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising) {
/* If motion_magnitude is small, making the denoiser more aggressive by
* increasing the adjustment for each level, level1 adjustment is
* increased, the deltas stay the same.
@@ -60,14 +63,6 @@
const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
-
- /* Local variables for array pointers and strides. */
- unsigned char *sig = signal->thismb;
- int sig_stride = 16;
- unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
- int mc_running_avg_y_stride = mc_running_avg->y_stride;
- unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
- int running_avg_y_stride = running_avg->y_stride;
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
/* Go over lines. */
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index dd733e5..34879cf 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -125,6 +125,7 @@
int optimize;
int q_index;
+ int increase_denoising;
#if CONFIG_TEMPORAL_DENOISING
MB_PREDICTION_MODE best_sse_inter_mode;
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 7819265..1e645fb 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -21,6 +21,7 @@
*/
static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
/*
* The filter function was modified to reduce the computational complexity.
@@ -51,27 +52,32 @@
* [16, 255] 6 7
*/
-int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
- YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
- unsigned int motion_magnitude, int y_offset,
- int uv_offset)
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
+ unsigned char *running_avg_y, int avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising)
{
- unsigned char *sig = signal->thismb;
- int sig_stride = 16;
- unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
- int mc_avg_y_stride = mc_running_avg->y_stride;
- unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
- int avg_y_stride = running_avg->y_stride;
- int r, c, i;
+ unsigned char *running_avg_y_start = running_avg_y;
+ unsigned char *sig_start = sig;
+ int sum_diff_thresh;
+ int r, c;
int sum_diff = 0;
int adj_val[3] = {3, 4, 6};
-
+ int shift_inc1 = 0;
+ int shift_inc2 = 1;
/* If motion_magnitude is small, making the denoiser more aggressive by
- * increasing the adjustment for each level. */
+ * increasing the adjustment for each level. Add another increment for
+ * blocks that are labeled for increased denoising. */
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
{
- for (i = 0; i < 3; i++)
- adj_val[i] += 1;
+ if (increase_denoising) {
+ shift_inc1 = 1;
+ shift_inc2 = 2;
+ }
+ adj_val[0] += shift_inc2;
+ adj_val[1] += shift_inc2;
+ adj_val[2] += shift_inc2;
}
for (r = 0; r < 16; ++r)
@@ -85,8 +91,9 @@
diff = mc_running_avg_y[c] - sig[c];
absdiff = abs(diff);
- /* When |diff| < 4, use pixel value from last denoised raw. */
- if (absdiff <= 3)
+ // When |diff| <= |3 + shift_inc1|, use pixel value from
+ // last denoised raw.
+ if (absdiff <= 3 + shift_inc1)
{
running_avg_y[c] = mc_running_avg_y[c];
sum_diff += diff;
@@ -127,11 +134,12 @@
running_avg_y += avg_y_stride;
}
- if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ sum_diff_thresh= SUM_DIFF_THRESHOLD;
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+ if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
- vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
- signal->thismb, sig_stride);
+ vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}
@@ -192,7 +200,7 @@
int mv_row;
int mv_col;
unsigned int motion_magnitude2;
-
+ unsigned int sse_thresh;
MV_REFERENCE_FRAME frame = x->best_reference_frame;
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
@@ -277,7 +285,10 @@
mv_row = x->best_sse_mv.as_mv.row;
mv_col = x->best_sse_mv.as_mv.col;
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
- if (best_sse > SSE_THRESHOLD || motion_magnitude2
+ sse_thresh = SSE_THRESHOLD;
+ if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+
+ if (best_sse > sse_thresh || motion_magnitude2
> 8 * NOISE_MOTION_THRESHOLD)
{
decision = COPY_BLOCK;
@@ -285,12 +296,18 @@
if (decision == FILTER_BLOCK)
{
+ unsigned char *mc_running_avg_y =
+ denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
+ int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
+ unsigned char *running_avg_y =
+ denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
+ int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+
/* Filter. */
- decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
- &denoiser->yv12_running_avg[INTRA_FRAME],
- x,
- motion_magnitude2,
- recon_yoffset, recon_uvoffset);
+ decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
+ running_avg_y, avg_y_stride,
+ x->thismb, 16, motion_magnitude2,
+ x->increase_denoising);
}
if (decision == COPY_BLOCK)
{
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index cc9913a..ae744d2 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -18,6 +18,7 @@
#endif
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
enum vp8_denoiser_decision
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 39a3baf..cf6a82f 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -1177,6 +1177,7 @@
x->best_reference_frame = best_mbmode.ref_frame;
best_sse = best_rd_sse;
}
+ x->increase_denoising = 0;
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset);
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index cceb826..5112f89 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -22,26 +22,28 @@
signed char e[16];
};
-int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
- YV12_BUFFER_CONFIG *running_avg,
- MACROBLOCK *signal, unsigned int motion_magnitude,
- int y_offset, int uv_offset)
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
+ int mc_avg_y_stride,
+ unsigned char *running_avg_y, int avg_y_stride,
+ unsigned char *sig, int sig_stride,
+ unsigned int motion_magnitude,
+ int increase_denoising)
{
- unsigned char *sig = signal->thismb;
- int sig_stride = 16;
- unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
- int mc_avg_y_stride = mc_running_avg->y_stride;
- unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
- int avg_y_stride = running_avg->y_stride;
+ unsigned char *running_avg_y_start = running_avg_y;
+ unsigned char *sig_start = sig;
+ int sum_diff_thresh;
int r;
+ int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4);
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
const __m128i k_8 = _mm_set1_epi8(8);
const __m128i k_16 = _mm_set1_epi8(16);
/* Modify each level's adjustment according to motion_magnitude. */
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 7 + shift_inc : 6);
/* Difference between level 3 and level 2 is 2. */
const __m128i l32 = _mm_set1_epi8(2);
/* Difference between level 2 and level 1 is 1. */
@@ -108,13 +110,14 @@
+ s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ s.e[12] + s.e[13] + s.e[14] + s.e[15];
- if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ sum_diff_thresh = SUM_DIFF_THRESHOLD;
+ if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+ if (abs(sum_diff) > sum_diff_thresh)
{
return COPY_BLOCK;
}
}
- vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
- signal->thismb, sig_stride);
+ vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
return FILTER_BLOCK;
}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index e58cdad..8282547 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -158,11 +158,13 @@
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
# common (neon)
-VP8_COMMON_SRCS-$(ARCH_NEON_ASM) += common/arm/reconintra_arm.c
+#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
@@ -173,14 +175,12 @@
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 2dccb70..04db7c0 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -45,7 +45,7 @@
vpx_memcpy(dest, src, n * sizeof(*src)); \
}
-#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest))
+#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest))
#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
static INLINE uint8_t clip_pixel(int val) {
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 8f150a4..d2522bb 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -24,10 +24,9 @@
*/
static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
size_t member_offset) {
- int mi_row;
- int mi_col;
+ int mi_row, mi_col;
int mi_index = 0;
- MODE_INFO **mi_8x8 = cm->mi_grid_visible;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
char prefix = descriptor[0];
@@ -38,7 +37,7 @@
fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(file, "%2d ",
- *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) +
+ *((int*) ((char *) (&mi[mi_index]->mbmi) +
member_offset)));
mi_index++;
}
@@ -52,7 +51,7 @@
int mi_col;
int mi_index = 0;
FILE *mvs = fopen(file, "a");
- MODE_INFO **mi_8x8 = cm->mi_grid_visible;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
@@ -67,8 +66,8 @@
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs, "V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row,
- mi_8x8[mi_index]->mbmi.mv[0].as_mv.col);
+ fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
+ mi[mi_index]->mbmi.mv[0].as_mv.col);
mi_index++;
}
fprintf(mvs, "\n");
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 7474a88..afcdf22 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -32,7 +32,8 @@
};
// Lagrangian interpolation filter
-const InterpKernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, const InterpKernel,
+ vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{ 0, 1, -5, 126, 8, -3, 1, 0},
{ -1, 3, -10, 122, 18, -6, 2, 0},
@@ -52,7 +53,8 @@
};
// DCT based filter
-const InterpKernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, const InterpKernel,
+ vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
{0, 0, 0, 128, 0, 0, 0, 0},
{-1, 3, -7, 127, 8, -3, 1, 0},
{-2, 5, -13, 125, 17, -6, 3, -1},
@@ -72,7 +74,8 @@
};
// freqmultiplier = 0.5
-const InterpKernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS] = {
+DECLARE_ALIGNED(256, const InterpKernel,
+ vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
{-2, -2, 29, 63, 41, 2, -3, 0},
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 29d3867..8c359c7 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -41,12 +41,6 @@
DECLARE_ALIGNED(256, extern const InterpKernel,
vp9_bilinear_filters[SUBPEL_SHIFTS]);
-DECLARE_ALIGNED(256, extern const InterpKernel,
- vp9_sub_pel_filters_8[SUBPEL_SHIFTS]);
-DECLARE_ALIGNED(256, extern const InterpKernel,
- vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]);
-DECLARE_ALIGNED(256, extern const InterpKernel,
- vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]);
// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 5b43e23..efd0249 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -619,12 +619,12 @@
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
- MODE_INFO **mi_8x8, const int mode_info_stride,
+ MODE_INFO **mi, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8;
const loop_filter_info_n *const lfi_n = &cm->lf_info;
- MODE_INFO **mip = mi_8x8;
- MODE_INFO **mip2 = mi_8x8;
+ MODE_INFO **mip = mi;
+ MODE_INFO **mip2 = mi;
// These are offsets to the next mi in the 64x64 block. It is what gets
// added to the mi ptr as we go through each loop. It helps us to avoids
@@ -1192,32 +1192,33 @@
}
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
- VP9_COMMON *cm, MACROBLOCKD *xd,
+ VP9_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- int mi_row, mi_col;
+ const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
+ planes[1].subsampling_x == 1);
LOOP_FILTER_MASK lfm;
- int use_420 = y_only || (xd->plane[1].subsampling_y == 1 &&
- xd->plane[1].subsampling_x == 1);
+ int mi_row, mi_col;
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
- MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
- vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+ vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
if (use_420)
- vp9_setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mi_stride,
+ vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
&lfm);
for (plane = 0; plane < num_planes; ++plane) {
if (use_420)
- vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
+ vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
else
- filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
+ filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
mi_row, mi_col);
}
}
@@ -1239,7 +1240,7 @@
}
end_mi_row = start_mi_row + mi_rows_to_filter;
vp9_loop_filter_frame_init(cm, frame_filter_level);
- vp9_loop_filter_rows(frame, cm, xd,
+ vp9_loop_filter_rows(frame, cm, xd->plane,
start_mi_row, end_mi_row,
y_only);
}
@@ -1247,7 +1248,7 @@
int vp9_loop_filter_worker(void *arg1, void *arg2) {
LFWorkerData *const lf_data = (LFWorkerData*)arg1;
(void)arg2;
- vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
+ vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
return 1;
}
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 83463c5..6fa2773 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -112,15 +112,15 @@
// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
- struct VP9Common *cm, struct macroblockd *xd,
+ struct VP9Common *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only);
typedef struct LoopFilterWorkerData {
const YV12_BUFFER_CONFIG *frame_buffer;
struct VP9Common *cm;
- struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the
- // loopfilter. the planes are necessary as their state
- // is changed during decode.
+ struct macroblockd_plane planes[MAX_MB_PLANE];
+
int start;
int stop;
int y_only;
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index 5601a93..9f32104 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -24,61 +24,7 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_textblit.h"
-#define RGB_TO_YUV(t) \
- ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
- (0.098*(float)(t & 0xff)) + 16), \
- (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \
- (0.439*(float)(t & 0xff)) + 128), \
- ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \
- (0.071*(float)(t & 0xff)) + 128)
-
-/* global constants */
-#if 0 && CONFIG_POSTPROC_VISUALIZER
-static const unsigned char PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
- { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
- { RGB_TO_YUV(0x00FF00) }, /* Green */
- { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x8F0000) }, /* Dark Red */
- { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
- { RGB_TO_YUV(0x006400) }, /* DarkGreen */
- { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
- { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
- { RGB_TO_YUV(0x00008B) }, /* Dark blue */
- { RGB_TO_YUV(0x551A8B) }, /* Purple */
- { RGB_TO_YUV(0xFF0000) } /* Red */
- { RGB_TO_YUV(0xCC33FF) }, /* Magenta */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = {
- { RGB_TO_YUV(0x6633ff) }, /* Purple */
- { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
- { RGB_TO_YUV(0xff33cc) }, /* Pink */
- { RGB_TO_YUV(0xff3366) }, /* Coral */
- { RGB_TO_YUV(0x3366ff) }, /* Blue */
- { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
- { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
- { RGB_TO_YUV(0xff6633) }, /* Orange */
- { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
- { RGB_TO_YUV(0x8ab800) }, /* Green */
- { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
- { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
- { RGB_TO_YUV(0x66ff33) }, /* Light Green */
- { RGB_TO_YUV(0xccff33) }, /* Yellow */
-};
-
-static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = {
- { RGB_TO_YUV(0x00ff00) }, /* Blue */
- { RGB_TO_YUV(0x0000ff) }, /* Green */
- { RGB_TO_YUV(0xffff00) }, /* Yellow */
- { RGB_TO_YUV(0xff0000) }, /* Red */
-};
-#endif
-
+#if CONFIG_VP9_POSTPROC
static const short kernel5[] = {
1, 1, 4, 1, 1
};
@@ -448,163 +394,6 @@
}
}
-/* Blend the macro block with a solid colored square. Leave the
- * edges unblended to give distinction to macro blocks in areas
- * filled with the same color block.
- */
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- y += 2 * stride + 2;
- for (i = 0; i < 12; i++) {
- for (j = 0; j < 12; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- u += stride + 1;
- v += stride + 1;
-
- for (i = 0; i < 6; i++) {
- for (j = 0; j < 6; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
- }
-}
-
-/* Blend only the edge of the macro block. Leave center
- * unblended to allow for other visualizations to be layered.
- */
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 16; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- for (i = 0; i < 12; i++) {
- y[0] = (y[0] * alpha + y1_const) >> 16;
- y[1] = (y[1] * alpha + y1_const) >> 16;
- y[14] = (y[14] * alpha + y1_const) >> 16;
- y[15] = (y[15] * alpha + y1_const) >> 16;
- y += stride;
- }
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 16; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- for (j = 0; j < 8; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
-
- for (i = 0; i < 6; i++) {
- u[0] = (u[0] * alpha + u1_const) >> 16;
- v[0] = (v[0] * alpha + v1_const) >> 16;
-
- u[7] = (u[7] * alpha + u1_const) >> 16;
- v[7] = (v[7] * alpha + v1_const) >> 16;
-
- u += stride;
- v += stride;
- }
-
- for (j = 0; j < 8; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
-}
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v,
- int y1, int u1, int v1, int alpha, int stride) {
- int i, j;
- int y1_const = y1 * ((1 << 16) - alpha);
- int u1_const = u1 * ((1 << 16) - alpha);
- int v1_const = v1 * ((1 << 16) - alpha);
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 4; j++) {
- y[j] = (y[j] * alpha + y1_const) >> 16;
- }
- y += stride;
- }
-
- stride >>= 1;
-
- for (i = 0; i < 2; i++) {
- for (j = 0; j < 2; j++) {
- u[j] = (u[j] * alpha + u1_const) >> 16;
- v[j] = (v[j] * alpha + v1_const) >> 16;
- }
- u += stride;
- v += stride;
- }
-}
-
-static void constrain_line(int x0, int *x1, int y0, int *y1,
- int width, int height) {
- int dx;
- int dy;
-
- if (*x1 > width) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *x1 = width;
- if (dx)
- *y1 = ((width - x0) * dy) / dx + y0;
- }
- if (*x1 < 0) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *x1 = 0;
- if (dx)
- *y1 = ((0 - x0) * dy) / dx + y0;
- }
- if (*y1 > height) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *y1 = height;
- if (dy)
- *x1 = ((height - y0) * dx) / dy + x0;
- }
- if (*y1 < 0) {
- dx = *x1 - x0;
- dy = *y1 - y0;
-
- *y1 = 0;
- if (dy)
- *x1 = ((0 - y0) * dx) / dy + x0;
- }
-}
-
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
const int q = MIN(63, cm->lf.filter_level * 10 / 6);
@@ -643,328 +432,6 @@
ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
}
-#if 0 && CONFIG_POSTPROC_VISUALIZER
- if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
- char message[512];
- snprintf(message, sizeof(message) -1,
- "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
- (cm->frame_type == KEY_FRAME),
- cm->refresh_golden_frame,
- cm->base_qindex,
- cm->filter_level,
- flags,
- cm->mb_cols, cm->mb_rows);
- vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
- }
-
- if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
- int i, j;
- uint8_t *y_ptr;
- int mb_rows = ppbuf->y_height >> 4;
- int mb_cols = ppbuf->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = cm->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp9_filter each macro block */
- for (i = 0; i < mb_rows; i++) {
- for (j = 0; j < mb_cols; j++) {
- char zz[4];
-
- snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a');
-
- vp9_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
- }
- }
-
- if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
- int i, j;
- uint8_t *y_ptr;
- int mb_rows = ppbuf->y_height >> 4;
- int mb_cols = ppbuf->y_width >> 4;
- int mb_index = 0;
- MODE_INFO *mi = cm->mi;
-
- y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
- /* vp9_filter each macro block */
- for (i = 0; i < mb_rows; i++) {
- for (j = 0; j < mb_cols; j++) {
- char zz[4];
- int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
- mi[mb_index].mbmi.mode != SPLITMV &&
- mi[mb_index].mbmi.skip);
-
- if (cm->frame_type == KEY_FRAME)
- snprintf(zz, sizeof(zz) - 1, "a");
- else
- snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0');
-
- vp9_blit_text(zz, y_ptr, post->y_stride);
- mb_index++;
- y_ptr += 16;
- }
-
- mb_index++; /* border */
- y_ptr += post->y_stride * 16 - post->y_width;
- }
- }
-
- if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
- char message[512];
- snprintf(message, sizeof(message),
- "Bitrate: %10.2f framerate: %10.2f ",
- cm->bitrate, cm->framerate);
- vp9_blit_text(message, ppbuf->y_buffer, ppbuf->y_stride);
- }
-
- /* Draw motion vectors */
- if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
- int width = ppbuf->y_width;
- int height = ppbuf->y_height;
- uint8_t *y_buffer = ppbuf->y_buffer;
- int y_stride = ppbuf->y_stride;
- MODE_INFO *mi = cm->mi;
- int x0, y0;
-
- for (y0 = 0; y0 < height; y0 += 16) {
- for (x0 = 0; x0 < width; x0 += 16) {
- int x1, y1;
-
- if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
- mi++;
- continue;
- }
-
- if (mi->mbmi.mode == SPLITMV) {
- switch (mi->mbmi.partitioning) {
- case PARTITIONING_16X8 : { /* mv_top_bottom */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 8 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
-
- break;
- }
- case PARTITIONING_8X16 : { /* mv_left_right */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 8 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
-
- break;
- }
- case PARTITIONING_8X8 : { /* mv_quarters */
- union b_mode_info *bmi = &mi->bmi[0];
- MV *mv = &bmi->mv.as_mv;
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[2];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 4 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[8];
-
- x1 = x0 + 4 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
-
- bmi = &mi->bmi[10];
-
- x1 = x0 + 12 + (mv->col >> 3);
- y1 = y0 + 12 + (mv->row >> 3);
-
- constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
- vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
- break;
- }
- case PARTITIONING_4X4:
- default : {
- union b_mode_info *bmi = mi->bmi;
- int bx0, by0;
-
- for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
- for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
- MV *mv = &bmi->mv.as_mv;
-
- x1 = bx0 + 2 + (mv->col >> 3);
- y1 = by0 + 2 + (mv->row >> 3);
-
- constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
- vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
-
- bmi++;
- }
- }
- }
- }
- } else if (is_inter_mode(mi->mbmi.mode)) {
- MV *mv = &mi->mbmi.mv.as_mv;
- const int lx0 = x0 + 8;
- const int ly0 = y0 + 8;
-
- x1 = lx0 + (mv->col >> 3);
- y1 = ly0 + (mv->row >> 3);
-
- if (x1 != lx0 && y1 != ly0) {
- constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
- vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
-
- constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
- vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
- } else {
- vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
- }
- }
-
- mi++;
- }
- mi++;
- }
- }
-
- /* Color in block modes */
- if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
- && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
- int y, x;
- int width = ppbuf->y_width;
- int height = ppbuf->y_height;
- uint8_t *y_ptr = ppbuf->y_buffer;
- uint8_t *u_ptr = ppbuf->u_buffer;
- uint8_t *v_ptr = ppbuf->v_buffer;
- int y_stride = ppbuf->y_stride;
- MODE_INFO *mi = cm->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (mi->mbmi.mode == I4X4_PRED &&
- ((ppflags->display_mb_modes_flag & I4X4_PRED) ||
- ppflags->display_b_modes_flag)) {
- int by, bx;
- uint8_t *yl, *ul, *vl;
- union b_mode_info *bmi = mi->bmi;
-
- yl = y_ptr + x;
- ul = u_ptr + (x >> 1);
- vl = v_ptr + (x >> 1);
-
- for (by = 0; by < 16; by += 4) {
- for (bx = 0; bx < 16; bx += 4) {
- if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode))
- || (ppflags->display_mb_modes_flag & I4X4_PRED)) {
- Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
- U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
- V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
-
- vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V,
- 0xc000, y_stride);
- }
- bmi++;
- }
-
- yl += y_stride * 4;
- ul += y_stride * 1;
- vl += y_stride * 1;
- }
- } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) {
- Y = PREDICTION_MODE_colors[mi->mbmi.mode][0];
- U = PREDICTION_MODE_colors[mi->mbmi.mode][1];
- V = PREDICTION_MODE_colors[mi->mbmi.mode][2];
-
- vp9_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1),
- Y, U, V, 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-
- /* Color in frame reference blocks */
- if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
- ppflags->display_ref_frame_flag) {
- int y, x;
- int width = ppbuf->y_width;
- int height = ppbuf->y_height;
- uint8_t *y_ptr = ppbuf->y_buffer;
- uint8_t *u_ptr = ppbuf->u_buffer;
- uint8_t *v_ptr = ppbuf->v_buffer;
- int y_stride = ppbuf->y_stride;
- MODE_INFO *mi = cm->mi;
-
- for (y = 0; y < height; y += 16) {
- for (x = 0; x < width; x += 16) {
- int Y = 0, U = 0, V = 0;
-
- if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) {
- Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
- U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
- V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
-
- vp9_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1),
- Y, U, V, 0xc000, y_stride);
- }
-
- mi++;
- }
- y_ptr += y_stride * 16;
- u_ptr += y_stride * 4;
- v_ptr += y_stride * 4;
-
- mi++;
- }
- }
-#endif
-
*dest = *ppbuf;
/* handle problem with extending borders */
@@ -975,3 +442,4 @@
return 0;
}
+#endif
diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h
index e8b04d2..1644a1b 100644
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -33,12 +33,6 @@
int post_proc_flag;
int deblocking_level;
int noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
- int display_ref_frame_flag;
- int display_mb_modes_flag;
- int display_b_modes_flag;
- int display_mv_flag;
-#endif // CONFIG_POSTPROC_VISUALIZER
} vp9_ppflags_t;
#ifdef __cplusplus
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index e722d6a..edc36d7 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -409,7 +409,7 @@
}
}
-void vp9_setup_dst_planes(MACROBLOCKD *xd,
+void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
@@ -419,7 +419,7 @@
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- struct macroblockd_plane *const pd = &xd->plane[i];
+ struct macroblockd_plane *const pd = &planes[i];
setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
pd->subsampling_x, pd->subsampling_y);
}
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 86f3158..58c596e 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -57,7 +57,8 @@
dst->stride = stride;
}
-void vp9_setup_dst_planes(MACROBLOCKD *xd, const YV12_BUFFER_CONFIG *src,
+void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col);
void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 32e4551..403e105 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -31,6 +31,9 @@
ADST_ADST, // TM
};
+// This macro serves as a wrapper generator, so that all the prediction
+// functions can be unified and accessed through a pointer array. Note that the
+// above and left boundaries are not necessarily both used by every predictor.
#define intra_pred_sized(type, size) \
void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
ptrdiff_t stride, \
@@ -48,7 +51,7 @@
static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
-
+ (void) above;
// first column
for (r = 0; r < bs - 1; ++r)
dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
@@ -77,6 +80,7 @@
static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
+ (void) left;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c)
dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
@@ -92,6 +96,7 @@
static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
+ (void) left;
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c)
dst[c] = r + c + 2 < bs * 2 ? ROUND_POWER_OF_TWO(above[r + c] +
@@ -184,6 +189,7 @@
static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
+ (void) left;
for (r = 0; r < bs; r++) {
vpx_memcpy(dst, above, bs);
@@ -195,6 +201,7 @@
static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
+ (void) above;
for (r = 0; r < bs; r++) {
vpx_memset(dst, left[r], bs);
@@ -219,6 +226,8 @@
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r;
+ (void) above;
+ (void) left;
for (r = 0; r < bs; r++) {
vpx_memset(dst, 128, bs);
@@ -231,6 +240,7 @@
const uint8_t *above,
const uint8_t *left) {
int i, r, expected_dc, sum = 0;
+ (void) above;
for (i = 0; i < bs; i++)
sum += left[i];
@@ -246,6 +256,7 @@
static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int i, r, expected_dc, sum = 0;
+ (void) left;
for (i = 0; i < bs; i++)
sum += above[i];
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 7754763..1827396 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -703,7 +703,7 @@
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error/, "$sse2_x86inc";
+specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block/, "$sse2_x86inc";
@@ -772,7 +772,7 @@
add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_full_range_search/;
-add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse2/;
}
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 78909dd..8c4a303 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -21,13 +21,21 @@
return MIN(offset, mis);
}
-void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) {
tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+}
+
+void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) {
tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
}
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+ vp9_tile_set_row(tile, cm, row);
+ vp9_tile_set_col(tile, cm, col);
+}
+
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
index a97719e..ae58805 100644
--- a/vp9/common/vp9_tile_common.h
+++ b/vp9/common/vp9_tile_common.h
@@ -27,6 +27,9 @@
void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
int row, int col);
+void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row);
+void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col);
+
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9dc0cf1..de58939 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -316,7 +316,7 @@
// as they are always compared to values that are in 1/8th pel units
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
- vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
+ vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
return &xd->mi[0]->mbmi;
}
@@ -675,64 +675,6 @@
setup_display_size(cm, rb);
}
-static void decode_tile(VP9Decoder *pbi, const TileInfo *const tile,
- int do_loopfilter_inline, vp9_reader *r) {
- const int num_threads = pbi->max_threads;
- VP9_COMMON *const cm = &pbi->common;
- int mi_row, mi_col;
- MACROBLOCKD *xd = &pbi->mb;
-
- if (do_loopfilter_inline) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- lf_data->frame_buffer = get_frame_new_buffer(cm);
- lf_data->cm = cm;
- lf_data->xd = pbi->mb;
- lf_data->stop = 0;
- lf_data->y_only = 0;
- vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
- }
-
- for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
- mi_row += MI_BLOCK_SIZE) {
- // For a SB there are 2 left contexts, each pertaining to a MB row within
- vp9_zero(xd->left_context);
- vp9_zero(xd->left_seg_context);
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE) {
- decode_partition(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
- }
-
- if (do_loopfilter_inline) {
- const int lf_start = mi_row - MI_BLOCK_SIZE;
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-
- // delay the loopfilter by 1 macroblock row.
- if (lf_start < 0) continue;
-
- // decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue;
-
- vp9_worker_sync(&pbi->lf_worker);
- lf_data->start = lf_start;
- lf_data->stop = mi_row;
- if (num_threads > 1) {
- vp9_worker_launch(&pbi->lf_worker);
- } else {
- vp9_worker_execute(&pbi->lf_worker);
- }
- }
- }
-
- if (do_loopfilter_inline) {
- LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-
- vp9_worker_sync(&pbi->lf_worker);
- lf_data->start = lf_data->stop;
- lf_data->stop = cm->mi_rows;
- vp9_worker_execute(&pbi->lf_worker);
- }
-}
-
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
int min_log2_tile_cols, max_log2_tile_cols, max_ones;
vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
@@ -811,16 +753,35 @@
static const uint8_t *decode_tiles(VP9Decoder *pbi,
const uint8_t *data,
- const uint8_t *data_end,
- int do_loopfilter_inline) {
+ const uint8_t *data_end) {
VP9_COMMON *const cm = &pbi->common;
const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col;
- const uint8_t *end = NULL;
- vp9_reader r;
+ int mi_row, mi_col;
+ TileData *tile_data = NULL;
+
+ if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
+ vpx_memalign(32, sizeof(LFWorkerData)));
+ pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+ if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Loop filter thread creation failed");
+ }
+ }
+
+ if (cm->lf.filter_level) {
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ lf_data->frame_buffer = get_frame_new_buffer(cm);
+ lf_data->cm = cm;
+ vp9_copy(lf_data->planes, pbi->mb.plane);
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
+ vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+ }
assert(tile_rows <= 4);
assert(tile_cols <= (1 << 6));
@@ -835,26 +796,88 @@
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
- // Decode tiles using data from tile_buffers
+ if (pbi->tile_data == NULL ||
+ (tile_cols * tile_rows) != pbi->total_tiles) {
+ vpx_free(pbi->tile_data);
+ CHECK_MEM_ERROR(
+ cm,
+ pbi->tile_data,
+ vpx_malloc(tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
+ pbi->total_tiles = tile_rows * tile_cols;
+ }
+
+ // Load all tile information into tile_data.
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- const int col = pbi->inv_tile_order ? tile_cols - tile_col - 1 : tile_col;
- const int last_tile = tile_row == tile_rows - 1 &&
- col == tile_cols - 1;
- const TileBuffer *const buf = &tile_buffers[tile_row][col];
TileInfo tile;
-
- vp9_tile_init(&tile, cm, tile_row, col);
- setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r,
- pbi->decrypt_cb, pbi->decrypt_state);
- decode_tile(pbi, &tile, do_loopfilter_inline, &r);
-
- if (last_tile)
- end = vp9_reader_find_end(&r);
+ const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+ tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
+ tile_data->cm = cm;
+ tile_data->xd = pbi->mb;
+ tile_data->xd.corrupted = 0;
+ vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col);
+ setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+ &tile_data->bit_reader, pbi->decrypt_cb,
+ pbi->decrypt_state);
+ init_macroblockd(cm, &tile_data->xd);
+ vp9_zero(tile_data->xd.dqcoeff);
}
}
- return end;
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ TileInfo tile;
+ vp9_tile_set_row(&tile, cm, tile_row);
+ for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int col = pbi->inv_tile_order ?
+ tile_cols - tile_col - 1 : tile_col;
+ tile_data = pbi->tile_data + tile_cols * tile_row + col;
+ vp9_tile_set_col(&tile, tile_data->cm, col);
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
+ &tile_data->bit_reader, BLOCK_64X64);
+ }
+ }
+ // Loopfilter one row.
+ if (cm->lf.filter_level) {
+ const int lf_start = mi_row - MI_BLOCK_SIZE;
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+
+ // delay the loopfilter by 1 macroblock row.
+ if (lf_start < 0) continue;
+
+ // decoding has completed: finish up the loop filter in this thread.
+ if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
+
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_start;
+ lf_data->stop = mi_row;
+ if (pbi->max_threads > 1) {
+ vp9_worker_launch(&pbi->lf_worker);
+ } else {
+ vp9_worker_execute(&pbi->lf_worker);
+ }
+ }
+ }
+ }
+
+ // Loopfilter remaining rows in the frame.
+ if (cm->lf.filter_level) {
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ vp9_worker_sync(&pbi->lf_worker);
+ lf_data->start = lf_data->stop;
+ lf_data->stop = cm->mi_rows;
+ vp9_worker_execute(&pbi->lf_worker);
+ }
+
+ // Get last tile data.
+ tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+
+ return vp9_reader_find_end(&tile_data->bit_reader);
}
static int tile_worker_hook(void *arg1, void *arg2) {
@@ -1307,8 +1330,6 @@
const int tile_rows = 1 << cm->log2_tile_rows;
const int tile_cols = 1 << cm->log2_tile_cols;
YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
- const int do_loopfilter_inline = tile_rows == 1 && tile_cols == 1 &&
- cm->lf.filter_level;
xd->cur_buf = new_fb;
if (!first_partition_size) {
@@ -1325,16 +1346,6 @@
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- if (do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
- CHECK_MEM_ERROR(cm, pbi->lf_worker.data1,
- vpx_memalign(32, sizeof(LFWorkerData)));
- pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
- if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
- vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
- "Loop filter thread creation failed");
- }
- }
-
init_macroblockd(cm, &pbi->mb);
if (cm->coding_use_prev_mi)
@@ -1357,9 +1368,11 @@
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
cm->frame_parallel_decoding_mode) {
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+ // If multiple threads are used to decode tiles, then we use those threads
+ // to do parallel loopfiltering.
+ vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
} else {
- *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end,
- do_loopfilter_inline);
+ *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
new_fb->corrupted |= xd->corrupted;
@@ -1388,16 +1401,5 @@
if (cm->refresh_frame_context)
cm->frame_contexts[cm->frame_context_idx] = cm->fc;
- // Loopfilter
- if (!do_loopfilter_inline) {
- // If multiple threads are used to decode tiles, then we use those threads
- // to do parallel loopfiltering.
- if (pbi->num_tile_workers) {
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0, 0);
- } else {
- vp9_loop_filter_frame(new_fb, cm, &pbi->mb, cm->lf.filter_level, 0, 0);
- }
- }
-
return 0;
}
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 80f0727..8902f17 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -32,7 +32,7 @@
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_dthread.h"
-void vp9_initialize_dec() {
+static void initialize_dec() {
static int init_done = 0;
if (!init_done) {
@@ -58,7 +58,7 @@
}
cm->error.setjmp = 1;
- vp9_initialize_dec();
+ initialize_dec();
vp9_rtcd();
@@ -90,6 +90,7 @@
vp9_remove_common(cm);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
+ vpx_free(pbi->tile_data);
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_worker_end(worker);
@@ -301,11 +302,14 @@
int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
vp9_ppflags_t *flags) {
int ret = -1;
+#if !CONFIG_VP9_POSTPROC
+ (void)*flags;
+#endif
if (pbi->ready_for_new_data == 1)
return ret;
- /* ie no raw frame to show!!! */
+ /* no raw frame to show!!! */
if (pbi->common.show_frame == 0)
return ret;
@@ -314,8 +318,8 @@
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(&pbi->common, sd, flags);
#else
- *sd = *pbi->common.frame_to_show;
- ret = 0;
+ *sd = *pbi->common.frame_to_show;
+ ret = 0;
#endif /*!CONFIG_POSTPROC*/
vp9_clear_system_state();
return ret;
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 12014a7..1a5576e 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -27,6 +27,13 @@
extern "C" {
#endif
+// TODO(hkuang): combine this with TileWorkerData.
+typedef struct TileData {  // per-tile decode state; see VP9Decoder::tile_data
+ VP9_COMMON *cm;  // passed to vp9_tile_set_col() and decode_partition()
+ vp9_reader bit_reader;  // this tile's reader, set up by setup_token_decoder()
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);  // set up by init_macroblockd()
+} TileData;
+
typedef struct VP9Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -39,10 +46,12 @@
int decoded_key_frame;
VP9Worker lf_worker;
-
VP9Worker *tile_workers;
int num_tile_workers;
+ TileData *tile_data;
+ int total_tiles;
+
VP9LfSync lf_row_sync;
vpx_decrypt_cb decrypt_cb;
@@ -52,8 +61,6 @@
int inv_tile_order;
} VP9Decoder;
-void vp9_initialize_dec();
-
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
size_t size, const uint8_t **dest);
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 5fe5ed7..bc6c418 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -89,7 +89,8 @@
// Implement row loopfiltering for each thread.
static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
- VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ VP9_COMMON *const cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only,
VP9LfSync *const lf_sync, int num_lf_workers) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
@@ -107,11 +108,11 @@
sync_read(lf_sync, r, c);
- vp9_setup_dst_planes(xd, frame_buffer, mi_row, mi_col);
+ vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
for (plane = 0; plane < num_planes; ++plane) {
- vp9_filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
+ vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
}
sync_write(lf_sync, r, c, sb_cols);
@@ -124,7 +125,7 @@
TileWorkerData *const tile_data = (TileWorkerData*)arg1;
LFWorkerData *const lf_data = &tile_data->lfdata;
- loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
+ loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
return 1;
@@ -135,7 +136,7 @@
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
VP9Decoder *pbi, VP9_COMMON *cm,
int frame_filter_level,
- int y_only, int partial_frame) {
+ int y_only) {
VP9LfSync *const lf_sync = &pbi->lf_row_sync;
// Number of superblock rows and cols
const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
@@ -186,7 +187,7 @@
// Loopfilter data
lf_data->frame_buffer = frame;
lf_data->cm = cm;
- lf_data->xd = pbi->mb;
+ vp9_copy(lf_data->planes, pbi->mb.plane);
lf_data->start = i;
lf_data->stop = sb_rows;
lf_data->y_only = y_only; // always do all planes in decoder
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index c3b7a29..a727e2a 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -52,6 +52,6 @@
struct VP9Decoder *pbi,
struct VP9Common *cm,
int frame_filter_level,
- int y_only, int partial_frame);
+ int y_only);
#endif // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 35d2ecf..8ef2b2e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -485,8 +485,8 @@
}
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
- vp9_coeff_stats *coef_branch_ct) {
- vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
+ vp9_coeff_stats *coef_branch_ct,
+ vp9_coeff_probs_model *coef_probs) {
vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
@@ -513,10 +513,9 @@
static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
TX_SIZE tx_size,
- vp9_coeff_stats *frame_branch_ct) {
- vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
- vp9_coeff_probs_model *old_frame_coef_probs =
- cpi->common.fc.coef_probs[tx_size];
+ vp9_coeff_stats *frame_branch_ct,
+ vp9_coeff_probs_model *new_coef_probs) {
+ vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
@@ -530,14 +529,14 @@
for (k = 0; k < COEF_BANDS; ++k) {
for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
for (t = 0; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
+ vp9_prob newp = new_coef_probs[i][j][k][l][t];
+ const vp9_prob oldp = old_coef_probs[i][j][k][l][t];
int s;
int u = 0;
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -567,15 +566,15 @@
for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
// calc probs and branch cts for this frame only
for (t = 0; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ vp9_prob newp = new_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
const vp9_prob upd = DIFF_UPDATE_PROB;
int s;
int u = 0;
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
@@ -612,8 +611,8 @@
for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
// calc probs and branch cts for this frame only
for (t = 0; t < entropy_nodes_update; ++t) {
- vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
- vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
+ vp9_prob newp = new_coef_probs[i][j][k][l][t];
+ vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
int s;
int u = 0;
if (l >= prev_coef_contexts_to_update ||
@@ -623,7 +622,7 @@
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_frame_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
@@ -670,14 +669,17 @@
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
vp9_clear_system_state();
for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
- build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size]);
+ build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
+ frame_coef_probs[tx_size]);
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size]);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
+ frame_coef_probs[tx_size]);
}
static void encode_loopfilter(struct loopfilter *lf,
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 2ccf4f8..c406860 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -20,43 +20,6 @@
extern "C" {
#endif
-// Structure to hold snapshot of coding context during the mode picking process
-typedef struct {
- MODE_INFO mic;
- uint8_t *zcoeff_blk;
- int16_t *coeff[MAX_MB_PLANE][3];
- int16_t *qcoeff[MAX_MB_PLANE][3];
- int16_t *dqcoeff[MAX_MB_PLANE][3];
- uint16_t *eobs[MAX_MB_PLANE][3];
-
- // dual buffer pointers, 0: in use, 1: best in store
- int16_t *coeff_pbuf[MAX_MB_PLANE][3];
- int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
- int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
- uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
-
- int is_coded;
- int num_4x4_blk;
- int skip;
- int_mv best_ref_mv[2];
- int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
- int rate;
- int distortion;
- int best_mode_index;
- int rddiv;
- int rdmult;
- int hybrid_pred_diff;
- int comp_pred_diff;
- int single_pred_diff;
- int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-
- // motion vector cache for adaptive motion search control in partition
- // search loop
- int_mv pred_mv[MAX_REF_FRAMES];
- INTERP_FILTER pred_interp_filter;
-} PICK_MODE_CONTEXT;
-
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
int16_t *qcoeff;
@@ -73,18 +36,6 @@
// Zbin Over Quant value
int16_t zbin_extra;
};
-typedef struct PC_TREE {
- int index;
- PARTITION_TYPE partitioning;
- BLOCK_SIZE block_size;
- PICK_MODE_CONTEXT none;
- PICK_MODE_CONTEXT horizontal[2];
- PICK_MODE_CONTEXT vertical[2];
- union {
- struct PC_TREE *split[4];
- PICK_MODE_CONTEXT *leaf_split[4];
- };
-} PC_TREE;
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
* coefficient in this block was zero) or not. */
@@ -155,17 +106,11 @@
int skip_encode;
// Used to store sub partition's choices.
- int_mv pred_mv[MAX_REF_FRAMES];
-
- PICK_MODE_CONTEXT *leaf_tree;
- PC_TREE *pc_tree;
- PC_TREE *pc_root;
- int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+ MV pred_mv[MAX_REF_FRAMES];
void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
};
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index 659935c..9b7a932 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -8,14 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encoder.h"
static const BLOCK_SIZE square[] = {
- BLOCK_8X8,
- BLOCK_16X16,
- BLOCK_32X32,
- BLOCK_64X64,
+ BLOCK_8X8,
+ BLOCK_16X16,
+ BLOCK_32X32,
+ BLOCK_64X64,
};
static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
@@ -62,30 +62,32 @@
}
}
}
-static void free_tree_contexts(PC_TREE *this_pc) {
- free_mode_context(&this_pc->none);
- free_mode_context(&this_pc->horizontal[0]);
- free_mode_context(&this_pc->horizontal[1]);
- free_mode_context(&this_pc->vertical[0]);
- free_mode_context(&this_pc->vertical[1]);
-}
-static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *this_pc,
+
+static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree,
int num_4x4_blk) {
- alloc_mode_context(cm, num_4x4_blk, &this_pc->none);
- alloc_mode_context(cm, num_4x4_blk/2, &this_pc->horizontal[0]);
- alloc_mode_context(cm, num_4x4_blk/2, &this_pc->vertical[0]);
+ alloc_mode_context(cm, num_4x4_blk, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]);
/* TODO(Jbb): for 4x8 and 8x4 these allocated values are not used.
* Figure out a better way to do this. */
- alloc_mode_context(cm, num_4x4_blk/2, &this_pc->horizontal[1]);
- alloc_mode_context(cm, num_4x4_blk/2, &this_pc->vertical[1]);
+ alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[1]);
+}
+
+static void free_tree_contexts(PC_TREE *tree) {
+ free_mode_context(&tree->none);
+ free_mode_context(&tree->horizontal[0]);
+ free_mode_context(&tree->horizontal[1]);
+ free_mode_context(&tree->vertical[0]);
+ free_mode_context(&tree->vertical[1]);
}
// This function sets up a tree of contexts such that at each square
// partition level. There are contexts for none, horizontal, vertical, and
// split. Along with a block_size value and a selected block_size which
// represents the state of our search.
-void vp9_setup_pc_tree(VP9_COMMON *cm, MACROBLOCK *x) {
+void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
int i, j;
const int leaf_nodes = 64;
const int tree_nodes = 64 + 16 + 4 + 1;
@@ -95,61 +97,62 @@
int square_index = 1;
int nodes;
- vpx_free(x->leaf_tree);
- CHECK_MEM_ERROR(cm, x->leaf_tree, vpx_calloc(leaf_nodes,
- sizeof(PICK_MODE_CONTEXT)));
- vpx_free(x->pc_tree);
- CHECK_MEM_ERROR(cm, x->pc_tree, vpx_calloc(tree_nodes, sizeof(PC_TREE)));
+ vpx_free(cpi->leaf_tree);
+ CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
+ sizeof(*cpi->leaf_tree)));
+ vpx_free(cpi->pc_tree);
+ CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
+ sizeof(*cpi->pc_tree)));
- this_pc = &x->pc_tree[0];
- this_leaf = &x->leaf_tree[0];
+ this_pc = &cpi->pc_tree[0];
+ this_leaf = &cpi->leaf_tree[0];
// 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
// context so we only need to allocate 1 for each 8x8 block.
for (i = 0; i < leaf_nodes; ++i)
- alloc_mode_context(cm, 1, &x->leaf_tree[i]);
+ alloc_mode_context(cm, 1, &cpi->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
- x->pc_tree[pc_tree_index].block_size = square[0];
- alloc_tree_contexts(cm, &x->pc_tree[pc_tree_index], 4);
- x->pc_tree[pc_tree_index].leaf_split[0] = this_leaf++;
- for (j = 1; j < 4; j++) {
- x->pc_tree[pc_tree_index].leaf_split[j] =
- x->pc_tree[pc_tree_index].leaf_split[0];
- }
+ PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+ alloc_tree_contexts(cm, tree, 4);
+ tree->leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++)
+ tree->leaf_split[j] = tree->leaf_split[0];
}
// Each node has 4 leaf nodes, fill each block_size level of the tree
// from leafs to the root.
- for (nodes = 16; nodes > 0; nodes >>= 2, ++square_index) {
- for (i = 0; i < nodes; ++pc_tree_index, ++i) {
- alloc_tree_contexts(cm, &x->pc_tree[pc_tree_index],
- 4 << (2 * square_index));
- x->pc_tree[pc_tree_index].block_size = square[square_index];
- for (j = 0; j < 4; j++) {
- x->pc_tree[pc_tree_index].split[j] = this_pc++;
- }
+ for (nodes = 16; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++)
+ tree->split[j] = this_pc++;
+ ++pc_tree_index;
}
+ ++square_index;
}
- x->pc_root = &x->pc_tree[tree_nodes-1];
- x->pc_root[0].none.best_mode_index = 2;
+ cpi->pc_root = &cpi->pc_tree[tree_nodes - 1];
+ cpi->pc_root[0].none.best_mode_index = 2;
}
-void vp9_free_pc_tree(MACROBLOCK *m) {
+void vp9_free_pc_tree(VP9_COMP *cpi) {
const int tree_nodes = 64 + 16 + 4 + 1;
int i;
// Set up all 4x4 mode contexts
for (i = 0; i < 64; ++i)
- free_mode_context(&m->leaf_tree[i]);
+ free_mode_context(&cpi->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
- for (i = 0; i < tree_nodes; i++) {
- free_tree_contexts(&m->pc_tree[i]);
- }
- vpx_free(m->pc_tree);
- m->pc_tree = 0;
- vpx_free(m->leaf_tree);
- m->leaf_tree = 0;
+ for (i = 0; i < tree_nodes; ++i)
+ free_tree_contexts(&cpi->pc_tree[i]);
+
+ vpx_free(cpi->pc_tree);
+ cpi->pc_tree = NULL;
+ vpx_free(cpi->leaf_tree);
+ cpi->leaf_tree = NULL;
}
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 66a6f00..bb384aa 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -11,9 +11,55 @@
#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
#define VP9_ENCODER_VP9_CONTEXT_TREE_H_
-#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_onyxc_int.h"
-void vp9_setup_pc_tree(VP9_COMMON *cm, MACROBLOCK *x);
-void vp9_free_pc_tree(MACROBLOCK *x);
+struct VP9_COMP;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+ MODE_INFO mic;  // mode info; mic.mbmi is saved by nonrd_use_partition()
+ uint8_t *zcoeff_blk;
+ int16_t *coeff[MAX_MB_PLANE][3];
+ int16_t *qcoeff[MAX_MB_PLANE][3];
+ int16_t *dqcoeff[MAX_MB_PLANE][3];
+ uint16_t *eobs[MAX_MB_PLANE][3];
+
+ // dual buffer pointers, 0: in use, 1: best in store
+ int16_t *coeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+ int is_coded;
+ int num_4x4_blk;  // NOTE(review): presumably alloc_mode_context()'s size; confirm
+ int skip;
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+ int64_t tx_rd_diff[TX_MODES];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[MAX_REF_FRAMES];
+ INTERP_FILTER pred_interp_filter;  // reset to SWITCHABLE when
+} PICK_MODE_CONTEXT;                // sf->adaptive_pred_interp_filter is set
+
+typedef struct PC_TREE {  // one node of the tree built by vp9_setup_pc_tree()
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;  // square size of this node, BLOCK_8X8..BLOCK_64X64
+ PICK_MODE_CONTEXT none;  // allocated for the node's full num_4x4_blk
+ PICK_MODE_CONTEXT horizontal[2];  // each allocated for num_4x4_blk / 2
+ PICK_MODE_CONTEXT vertical[2];  // each allocated for num_4x4_blk / 2
+ union {
+ struct PC_TREE *split[4];  // nodes above BLOCK_8X8: the four child nodes
+ PICK_MODE_CONTEXT *leaf_split[4];  // BLOCK_8X8: all four alias one leaf
+ };
+} PC_TREE;
+
+void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi);
+void vp9_free_pc_tree(struct VP9_COMP *cpi);
#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index d523239..5772767 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -445,20 +445,20 @@
step3[7] = step1[7] + step2[4];
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] - step2[2];
- step1[3] = step3[3] + step2[2];
- step1[4] = step3[4] + step2[5];
- step1[5] = step3[4] - step2[5];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
// step 6
@@ -755,10 +755,10 @@
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
step2[1] = fdct_round_shift(temp1);
step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
step2[5] = fdct_round_shift(temp1);
step2[6] = fdct_round_shift(temp2);
@@ -766,10 +766,10 @@
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] - step2[2];
- step1[3] = step3[3] + step2[2];
- step1[4] = step3[4] + step2[5];
- step1[5] = step3[4] - step2[5];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
step1[6] = step3[7] - step2[6];
step1[7] = step3[7] + step2[6];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index fba9465..ef33fca 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -70,6 +70,12 @@
128, 128, 128, 128, 128, 128, 128, 128
};
+typedef struct {  // NOTE(review): looks like per-block SSE/sum/variance; confirm
+ unsigned int sse;
+ int sum;
+ unsigned int var;
+} diff;
+
static void get_sse_sum_8x8(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse, int *sum) {
@@ -201,7 +207,7 @@
mbmi = &xd->mi[0]->mbmi;
// Set up destination pointers.
- vp9_setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
+ vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
// Set up limit values for MV components.
// Mv beyond the range do not produce new/different prediction block.
@@ -1538,7 +1544,7 @@
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
if (none_rate < INT_MAX) {
- none_rate += x->partition_cost[pl][PARTITION_NONE];
+ none_rate += cpi->partition_cost[pl][PARTITION_NONE];
none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist);
}
@@ -1636,7 +1642,7 @@
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
if (last_part_rate < INT_MAX) {
- last_part_rate += x->partition_cost[pl][partition];
+ last_part_rate += cpi->partition_cost[pl][partition];
last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist);
}
@@ -1689,11 +1695,11 @@
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
split_subsize);
- chosen_rate += x->partition_cost[pl][PARTITION_NONE];
+ chosen_rate += cpi->partition_cost[pl][PARTITION_NONE];
}
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
if (chosen_rate < INT_MAX) {
- chosen_rate += x->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist);
}
}
@@ -1805,15 +1811,11 @@
BLOCK_SIZE *max_block_size) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO **mi_8x8 = xd->mi;
- const int left_in_image = xd->left_available && mi_8x8[-1];
- const int above_in_image = xd->up_available &&
- mi_8x8[-xd->mi_stride];
- MODE_INFO **above_sb64_mi_8x8;
- MODE_INFO **left_sb64_mi_8x8;
-
- int row8x8_remaining = tile->mi_row_end - mi_row;
- int col8x8_remaining = tile->mi_col_end - mi_col;
+ MODE_INFO **mi = xd->mi;
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+ const int row8x8_remaining = tile->mi_row_end - mi_row;
+ const int col8x8_remaining = tile->mi_col_end - mi_col;
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
BLOCK_SIZE max_size = BLOCK_64X64;
@@ -1833,15 +1835,13 @@
}
// Find the min and max partition sizes used in the left SB64
if (left_in_image) {
- left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, left_sb64_mi_8x8,
- &min_size, &max_size);
+ MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, left_sb64_mi, &min_size, &max_size);
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
- above_sb64_mi_8x8 = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
- get_sb_partition_size_range(cpi, above_sb64_mi_8x8,
- &min_size, &max_size);
+ MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
+ get_sb_partition_size_range(cpi, above_sb64_mi, &min_size, &max_size);
}
// adjust observed min and max
if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
@@ -2021,7 +2021,7 @@
if (this_rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += x->partition_cost[pl][PARTITION_NONE];
+ this_rate += cpi->partition_cost[pl][PARTITION_NONE];
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
if (sum_rd < best_rd) {
@@ -2109,7 +2109,7 @@
if (sum_rd < best_rd && i == 4) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
+ sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
best_rate = sum_rate;
@@ -2163,7 +2163,7 @@
}
if (sum_rd < best_rd) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += x->partition_cost[pl][PARTITION_HORZ];
+ sum_rate += cpi->partition_cost[pl][PARTITION_HORZ];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
best_rd = sum_rd;
@@ -2212,7 +2212,7 @@
}
if (sum_rd < best_rd) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += x->partition_cost[pl][PARTITION_VERT];
+ sum_rate += cpi->partition_cost[pl][PARTITION_VERT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
best_rate = sum_rate;
@@ -2274,17 +2274,16 @@
int64_t dummy_dist;
int i;
- MACROBLOCK *x = &cpi->mb;
if (sf->adaptive_pred_interp_filter) {
for (i = 0; i < 64; ++i)
- x->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+ cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE;
for (i = 0; i < 64; ++i) {
- x->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
- x->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
- x->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
- x->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
}
}
@@ -2296,26 +2295,26 @@
sf->partition_search_type == VAR_BASED_PARTITION ||
sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
- MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
cpi->mb.source_variance = UINT_MAX;
if (sf->partition_search_type == FIXED_PARTITION) {
set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col,
+ set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
sf->always_this_block_size);
- rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, x->pc_root);
+ rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, cpi->pc_root);
} else if (sf->partition_search_type == VAR_BASED_FIXED_PARTITION) {
BLOCK_SIZE bsize;
set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
- rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, x->pc_root);
+ set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, cpi->pc_root);
} else if (sf->partition_search_type == VAR_BASED_PARTITION) {
choose_partitioning(cpi, tile, mi_row, mi_col);
- rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, x->pc_root);
+ rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, cpi->pc_root);
} else {
if ((cm->current_video_frame
% sf->last_partitioning_redo_frequency) == 0
@@ -2325,7 +2324,7 @@
|| cpi->rc.is_src_frame_alt_ref
|| ((sf->use_lastframe_partitioning ==
LAST_FRAME_PARTITION_LOW_MOTION) &&
- sb_has_motion(cm, prev_mi_8x8))) {
+ sb_has_motion(cm, prev_mi))) {
// If required set upper and lower partition size limits
if (sf->auto_min_max_partition_size) {
set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
@@ -2334,16 +2333,17 @@
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
+ &dummy_rate, &dummy_dist, 1, INT64_MAX,
+ cpi->pc_root);
} else {
if (sf->constrain_copy_partition &&
- sb_has_motion(cm, prev_mi_8x8))
- constrain_copy_partitioning(cpi, tile, mi_8x8, prev_mi_8x8,
+ sb_has_motion(cm, prev_mi))
+ constrain_copy_partitioning(cpi, tile, mi, prev_mi,
mi_row, mi_col, BLOCK_16X16);
else
- copy_partitioning(cm, mi_8x8, prev_mi_8x8);
- rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, x->pc_root);
+ copy_partitioning(cm, mi, prev_mi);
+ rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, cpi->pc_root);
}
}
} else {
@@ -2355,7 +2355,7 @@
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX, x->pc_root);
+ &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root);
}
}
}
@@ -2633,7 +2633,7 @@
if (this_rate != INT_MAX) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += x->partition_cost[pl][PARTITION_NONE];
+ this_rate += cpi->partition_cost[pl][PARTITION_NONE];
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
if (sum_rd < best_rd) {
int64_t stop_thresh = 4096;
@@ -2671,7 +2671,7 @@
sum_rd = 0;
if (do_split) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
+ sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
subsize = get_subsize(bsize, PARTITION_SPLIT);
for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
const int x_idx = (i & 1) * ms;
@@ -2730,7 +2730,7 @@
sum_rd = INT64_MAX;
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += x->partition_cost[pl][PARTITION_HORZ];
+ this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
sum_rate += this_rate;
sum_dist += this_dist;
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -2764,7 +2764,7 @@
sum_rd = INT64_MAX;
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += x->partition_cost[pl][PARTITION_VERT];
+ this_rate += cpi->partition_cost[pl][PARTITION_VERT];
sum_rate += this_rate;
sum_dist += this_dist;
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -2822,7 +2822,7 @@
static void nonrd_use_partition(VP9_COMP *cpi,
const TileInfo *const tile,
- MODE_INFO **mi_8x8,
+ MODE_INFO **mi,
TOKENEXTRA **tp,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int output_enabled,
@@ -2841,7 +2841,7 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- subsize = (bsize >= BLOCK_8X8) ? mi_8x8[0]->mbmi.sb_type : BLOCK_4X4;
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
partition = partition_lookup[bsl][subsize];
switch (partition) {
@@ -2869,7 +2869,7 @@
if (mi_row + hbs < cm->mi_rows) {
nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
&rate, &dist, subsize);
- pc_tree->horizontal[1].mic.mbmi = mi_8x8[0]->mbmi;
+ pc_tree->horizontal[1].mic.mbmi = mi[0]->mbmi;
if (rate != INT_MAX && dist != INT64_MAX &&
*totrate != INT_MAX && *totdist != INT64_MAX) {
*totrate += rate;
@@ -2879,10 +2879,10 @@
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+ nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
subsize, output_enabled, totrate, totdist,
pc_tree->split[0]);
- nonrd_use_partition(cpi, tile, mi_8x8 + hbs, tp,
+ nonrd_use_partition(cpi, tile, mi + hbs, tp,
mi_row, mi_col + hbs, subsize, output_enabled,
&rate, &dist, pc_tree->split[1]);
if (rate != INT_MAX && dist != INT64_MAX &&
@@ -2890,7 +2890,7 @@
*totrate += rate;
*totdist += dist;
}
- nonrd_use_partition(cpi, tile, mi_8x8 + hbs * mis, tp,
+ nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
mi_row + hbs, mi_col, subsize, output_enabled,
&rate, &dist, pc_tree->split[2]);
if (rate != INT_MAX && dist != INT64_MAX &&
@@ -2898,7 +2898,7 @@
*totrate += rate;
*totdist += dist;
}
- nonrd_use_partition(cpi, tile, mi_8x8 + hbs * mis + hbs, tp,
+ nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
mi_row + hbs, mi_col + hbs, subsize, output_enabled,
&rate, &dist, pc_tree->split[3]);
if (rate != INT_MAX && dist != INT64_MAX &&
@@ -2937,8 +2937,8 @@
int dummy_rate = 0;
int64_t dummy_dist = 0;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
- MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str;
BLOCK_SIZE bsize;
x->in_static_area = 0;
@@ -2949,22 +2949,22 @@
switch (cpi->sf.partition_search_type) {
case VAR_BASED_PARTITION:
choose_partitioning(cpi, tile, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, x->pc_root);
+ nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ 1, &dummy_rate, &dummy_dist, cpi->pc_root);
break;
case SOURCE_VAR_BASED_PARTITION:
- set_source_var_based_partition(cpi, tile, mi_8x8, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, x->pc_root);
+ set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
+ nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ 1, &dummy_rate, &dummy_dist, cpi->pc_root);
break;
case VAR_BASED_FIXED_PARTITION:
case FIXED_PARTITION:
bsize = cpi->sf.partition_search_type == FIXED_PARTITION ?
cpi->sf.always_this_block_size :
get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi_8x8, mi_row, mi_col, bsize);
- nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, x->pc_root);
+ set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
+ nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ 1, &dummy_rate, &dummy_dist, cpi->pc_root);
break;
case REFERENCE_PARTITION:
if (cpi->sf.partition_check ||
@@ -2975,12 +2975,12 @@
&cpi->sf.max_partition_size);
nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX,
- x->pc_root);
+ cpi->pc_root);
} else {
- copy_partitioning(cm, mi_8x8, prev_mi_8x8);
- nonrd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col,
+ copy_partitioning(cm, mi, prev_mi);
+ nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
- x->pc_root);
+ cpi->pc_root);
}
break;
default:
@@ -3043,7 +3043,7 @@
int i;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- PICK_MODE_CONTEXT *ctx = &x->pc_root->none;
+ PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][0];
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 131e932..72343cd 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -20,12 +20,6 @@
struct yv12_buffer_config;
struct VP9_COMP;
-typedef struct {
- unsigned int sse;
- int sum;
- unsigned int var;
-} diff;
-
void vp9_setup_src_planes(struct macroblock *x,
const struct yv12_buffer_config *src,
int mi_row, int mi_col);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6474763..1f68f03 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -187,7 +187,7 @@
vpx_free(cpi->tok);
cpi->tok = 0;
- vp9_free_pc_tree(&cpi->mb);
+ vp9_free_pc_tree(cpi);
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
@@ -455,7 +455,7 @@
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- vp9_setup_pc_tree(&cpi->common, &cpi->mb);
+ vp9_setup_pc_tree(&cpi->common, cpi);
}
static void update_frame_size(VP9_COMP *cpi) {
@@ -1414,6 +1414,7 @@
dst->alpha_buffer};
const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
dst->alpha_stride};
+ const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP);
int x, y, i;
for (y = 0; y < dst_h; y += 16) {
@@ -1429,8 +1430,8 @@
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
- vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * src_w / dst_w,
- vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * src_h / dst_h,
+ kernel[x_q4 & 0xf], 16 * src_w / dst_w,
+ kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
}
}
@@ -1725,8 +1726,6 @@
#endif
static void encode_without_recode_loop(VP9_COMP *cpi,
- size_t *size,
- uint8_t *dest,
int q) {
VP9_COMMON *const cm = &cpi->common;
vp9_clear_system_state();
@@ -2162,7 +2161,7 @@
}
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
- encode_without_recode_loop(cpi, size, dest, q);
+ encode_without_recode_loop(cpi, q);
} else {
encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
}
@@ -2779,6 +2778,9 @@
int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
vp9_ppflags_t *flags) {
VP9_COMMON *cm = &cpi->common;
+#if !CONFIG_VP9_POSTPROC
+ (void)flags;
+#endif
if (!cm->show_frame) {
return -1;
@@ -2787,7 +2789,6 @@
#if CONFIG_VP9_POSTPROC
ret = vp9_post_proc_frame(cm, dest, flags);
#else
-
if (cm->frame_to_show) {
*dest = *cm->frame_to_show;
dest->y_width = cm->width;
@@ -2798,64 +2799,13 @@
} else {
ret = -1;
}
-
#endif // !CONFIG_VP9_POSTPROC
vp9_clear_system_state();
return ret;
}
}
-int vp9_set_roimap(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
- unsigned int cols, int delta_q[MAX_SEGMENTS],
- int delta_lf[MAX_SEGMENTS],
- unsigned int threshold[MAX_SEGMENTS]) {
- signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
- struct segmentation *seg = &cpi->common.seg;
- const VP9_COMMON *const cm = &cpi->common;
- int i;
-
- if (cm->mb_rows != rows || cm->mb_cols != cols)
- return -1;
-
- if (!map) {
- vp9_disable_segmentation(seg);
- return 0;
- }
-
- vpx_memcpy(cpi->segmentation_map, map, cm->mi_rows * cm->mi_cols);
-
- // Activate segmentation.
- vp9_enable_segmentation(seg);
-
- // Set up the quant, LF and breakout threshold segment data
- for (i = 0; i < MAX_SEGMENTS; i++) {
- feature_data[SEG_LVL_ALT_Q][i] = delta_q[i];
- feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i];
- cpi->segment_encode_breakout[i] = threshold[i];
- }
-
- // Enable the loop and quant changes in the feature mask
- for (i = 0; i < MAX_SEGMENTS; i++) {
- if (delta_q[i])
- vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
- else
- vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
-
- if (delta_lf[i])
- vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
- else
- vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
- }
-
- // Initialize the feature data structure
- // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
- vp9_set_segment_data(seg, &feature_data[0][0], SEGMENT_DELTADATA);
-
- return 0;
-}
-
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map,
- unsigned int rows, unsigned int cols) {
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
if (map) {
vpx_memcpy(cpi->active_map, map, rows * cols);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index f48909e..47c9019 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -24,6 +24,7 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_lookahead.h"
@@ -391,7 +392,6 @@
RATE_CONTROL rc;
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
struct vpx_codec_pkt_list *output_pkt_list;
@@ -411,8 +411,8 @@
// Default value is 1. From first pass stats, encode_breakout may be disabled.
ENCODE_BREAKOUT_TYPE allow_encode_breakout;
- // Get threshold from external input. In real time mode, it can be
- // overwritten according to encoding speed.
+ // Get threshold from external input. A suggested threshold is 800 for HD
+ // clips, and 300 for < HD clips.
int encode_breakout;
unsigned char *segmentation_map;
@@ -438,7 +438,7 @@
uint64_t time_pick_lpf;
uint64_t time_encode_sb_row;
- struct twopass_rc twopass;
+ TWO_PASS twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
@@ -503,6 +503,11 @@
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root;
+ int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+
#if CONFIG_MULTIPLE_ARF
// ARF tracking variables.
int multi_arf_enabled;
@@ -552,14 +557,7 @@
int vp9_update_entropy(VP9_COMP *cpi, int update);
-int vp9_set_roimap(VP9_COMP *cpi, unsigned char *map,
- unsigned int rows, unsigned int cols,
- int delta_q[MAX_SEGMENTS],
- int delta_lf[MAX_SEGMENTS],
- unsigned int threshold[MAX_SEGMENTS]);
-
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map,
- unsigned int rows, unsigned int cols);
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 43ae7c1..9929ae1 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -46,6 +46,9 @@
#define GF_RMAX 96.0
#define ERR_DIVISOR 150.0
#define MIN_DECAY_FACTOR 0.1
+#define SVC_FACTOR_PT_LOW 0.45
+#define FACTOR_PT_LOW 0.5
+#define FACTOR_PT_HIGH 0.9
#define KF_MB_INTRA_MIN 150
#define GF_MB_INTRA_MIN 100
@@ -61,6 +64,7 @@
#define MIN_GF_INTERVAL 4
#endif
+
// #define LONG_TERM_VBR_CORRECTION
static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
@@ -78,12 +82,12 @@
// Resets the first pass file to the given position using a relative seek from
// the current position.
-static void reset_fpf_position(struct twopass_rc *p,
+static void reset_fpf_position(TWO_PASS *p,
const FIRSTPASS_STATS *position) {
p->stats_in = position;
}
-static int lookup_next_frame_stats(const struct twopass_rc *p,
+static int lookup_next_frame_stats(const TWO_PASS *p,
FIRSTPASS_STATS *next_frame) {
if (p->stats_in >= p->stats_in_end)
return EOF;
@@ -94,7 +98,7 @@
// Read frame stats at an offset from the current position.
-static int read_frame_stats(const struct twopass_rc *p,
+static int read_frame_stats(const TWO_PASS *p,
FIRSTPASS_STATS *frame_stats, int offset) {
const FIRSTPASS_STATS *fps_ptr = p->stats_in;
@@ -111,7 +115,7 @@
return 1;
}
-static int input_stats(struct twopass_rc *p, FIRSTPASS_STATS *fps) {
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
if (p->stats_in >= p->stats_in_end)
return EOF;
@@ -257,7 +261,7 @@
// harder frames.
static double calculate_modified_err(const VP9_COMP *cpi,
const FIRSTPASS_STATS *this_frame) {
- const struct twopass_rc *twopass = &cpi->twopass;
+ const TWO_PASS *twopass = &cpi->twopass;
const SVC *const svc = &cpi->svc;
const FIRSTPASS_STATS *stats;
double av_err;
@@ -475,7 +479,7 @@
TileInfo tile;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- const PICK_MODE_CONTEXT *ctx = &x->pc_root->none;
+ const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
int i;
int recon_yoffset, recon_uvoffset;
@@ -500,7 +504,7 @@
int new_mv_count = 0;
int sum_in_vectors = 0;
uint32_t lastmv_as_int = 0;
- struct twopass_rc *twopass = &cpi->twopass;
+ TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
@@ -540,7 +544,7 @@
vp9_setup_src_planes(x, cpi->Source, 0, 0);
vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
- vp9_setup_dst_planes(xd, new_yv12, 0, 0);
+ vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
xd->mi = cm->mi_grid_visible;
xd->mi[0] = cm->mi;
@@ -937,8 +941,8 @@
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
const double factor =
calc_correction_factor(err_per_mb, ERR_DIVISOR,
- is_svc_upper_layer ? 0.8 : 0.5,
- is_svc_upper_layer ? 1.0 : 0.90, q);
+ is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
+ FACTOR_PT_LOW, FACTOR_PT_HIGH, q);
const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
factor * speed_term);
if (bits_per_mb <= target_norm_bits_per_mb)
@@ -959,7 +963,7 @@
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
(svc->number_temporal_layers == 1);
- struct twopass_rc *const twopass = is_spatial_svc ?
+ TWO_PASS *const twopass = is_spatial_svc ?
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
double frame_rate;
FIRSTPASS_STATS *stats;
@@ -1007,25 +1011,6 @@
// This variable monitors how far behind the second ref update is lagging.
twopass->sr_update_lag = 1;
- // Scan the first pass file and calculate an average Intra / Inter error
- // score ratio for the sequence.
- {
- const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
- FIRSTPASS_STATS this_frame;
- double sum_iiratio = 0.0;
-
- while (input_stats(twopass, &this_frame) != EOF) {
- const double iiratio = this_frame.intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
- sum_iiratio += fclamp(iiratio, 1.0, 20.0);
- }
-
- twopass->avg_iiratio = sum_iiratio /
- DOUBLE_DIVIDE_CHECK((double)stats->count);
-
- reset_fpf_position(twopass, start_pos);
- }
-
// Scan the first pass file and calculate a modified total error based upon
// the bias/power function used to allocate bits.
{
@@ -1072,7 +1057,7 @@
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(struct twopass_rc *twopass,
+static int detect_transition_to_still(TWO_PASS *twopass,
int frame_interval, int still_interval,
double loop_decay_rate,
double last_decay_rate) {
@@ -1110,7 +1095,7 @@
// This function detects a flash through the high relative pcnt_second_ref
// score in the frame following a flash frame. The offset passed in should
// reflect this.
-static int detect_flash(const struct twopass_rc *twopass, int offset) {
+static int detect_flash(const TWO_PASS *twopass, int offset) {
FIRSTPASS_STATS next_frame;
int flash_detected = 0;
@@ -1196,7 +1181,7 @@
int f_frames, int b_frames,
int *f_boost, int *b_boost) {
FIRSTPASS_STATS this_frame;
- struct twopass_rc *const twopass = &cpi->twopass;
+ TWO_PASS *const twopass = &cpi->twopass;
int i;
double boost_score = 0.0;
double mv_ratio_accumulator = 0.0;
@@ -1418,34 +1403,29 @@
#endif
// Calculate a section intra ratio used in setting max loop filter.
-static void calculate_section_intra_ratio(struct twopass_rc *twopass,
- const FIRSTPASS_STATS *start_pos,
- int section_length) {
- FIRSTPASS_STATS next_frame = { 0 };
- FIRSTPASS_STATS sectionstats = { 0 };
- int i;
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
- reset_fpf_position(twopass, start_pos);
-
- for (i = 0; i < section_length; ++i) {
- input_stats(twopass, &next_frame);
- accumulate_stats(&sectionstats, &next_frame);
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
}
- avg_stats(&sectionstats);
-
- twopass->section_intra_rating =
- (int)(sectionstats.intra_error /
- DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
-
- reset_fpf_position(twopass, start_pos);
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
}
// Calculate the total bits to allocate in this GF/ARF group.
static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
double gf_group_err) {
const RATE_CONTROL *const rc = &cpi->rc;
- const struct twopass_rc *const twopass = &cpi->twopass;
+ const TWO_PASS *const twopass = &cpi->twopass;
const int max_bits = frame_max_bits(rc, &cpi->oxcf);
int64_t total_group_bits;
@@ -1469,13 +1449,36 @@
return total_group_bits;
}
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count,
+ int boost, int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // return 0 for invalid inputs (could arise e.g. through rounding errors)
+ if (!boost || (total_group_bits <= 0) || (frame_count <= 0) )
+ return 0;
+
+ allocation_chunks = (frame_count * 100) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0);
+}
+
+
// Analyse and define a gf/arf group.
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- struct twopass_rc *const twopass = &cpi->twopass;
- FIRSTPASS_STATS next_frame = { 0 };
- const FIRSTPASS_STATS *start_pos;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
int i;
double boost_score = 0.0;
double old_boost_score = 0.0;
@@ -1501,11 +1504,10 @@
int flash_detected;
int active_max_gf_interval;
- twopass->gf_group_bits = 0;
-
vp9_clear_system_state();
+ vp9_zero(next_frame);
- start_pos = twopass->stats_in;
+ twopass->gf_group_bits = 0;
// Load stats for the current frame.
mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1707,127 +1709,65 @@
}
#endif
#endif
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
// Calculate the bits to be allocated to the gf/arf group as a whole
twopass->gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
- // Reset the file position.
- reset_fpf_position(twopass, start_pos);
-
- // Assign bits to the arf or gf.
- for (i = 0; i <= (rc->source_alt_ref_pending &&
- cpi->common.frame_type != KEY_FRAME); ++i) {
- int allocation_chunks;
+ // Calculate the extra bits to be used for boosted frame(s)
+ {
int q = rc->last_q[INTER_FRAME];
- int gf_bits;
-
int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100;
// Set max and minimum boost and hence minimum allocation.
boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200);
- if (rc->source_alt_ref_pending && i == 0)
- allocation_chunks = ((rc->baseline_gf_interval + 1) * 100) + boost;
- else
- allocation_chunks = (rc->baseline_gf_interval * 100) + (boost - 100);
+ // Calculate the extra bits to be used for boosted frame(s)
+ twopass->gf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+ boost, twopass->gf_group_bits);
- // Prevent overflow.
- if (boost > 1023) {
- int divisor = boost >> 10;
- boost /= divisor;
- allocation_chunks /= divisor;
- }
- // Calculate the number of bits to be spent on the gf or arf based on
- // the boost number.
- gf_bits = (int)((double)boost * (twopass->gf_group_bits /
- (double)allocation_chunks));
-
- // If the frame that is to be boosted is simpler than the average for
- // the gf/arf group then use an alternative calculation
- // based on the error score of the frame itself.
- if (rc->baseline_gf_interval < 1 ||
- mod_frame_err < gf_group_err / (double)rc->baseline_gf_interval) {
- double alt_gf_grp_bits = (double)twopass->kf_group_bits *
- (mod_frame_err * (double)rc->baseline_gf_interval) /
- DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left);
-
- int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
- (double)allocation_chunks));
-
- if (gf_bits > alt_gf_bits)
- gf_bits = alt_gf_bits;
- } else {
- // If it is harder than other frames in the group make sure it at
- // least receives an allocation in keeping with its relative error
- // score, otherwise it may be worse off than an "un-boosted" frame.
- int alt_gf_bits = (int)((double)twopass->kf_group_bits *
- mod_frame_err /
- DOUBLE_DIVIDE_CHECK(twopass->kf_group_error_left));
-
- if (alt_gf_bits > gf_bits)
- gf_bits = alt_gf_bits;
- }
-
- // Don't allow a negative value for gf_bits.
- if (gf_bits < 0)
- gf_bits = 0;
-
- if (i == 0) {
- twopass->gf_bits = gf_bits;
- }
- if (i == 1 ||
- (!rc->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME &&
- !vp9_is_upper_layer_key_frame(cpi))) {
- // Calculate the per frame bit target for this frame.
- vp9_rc_set_frame_target(cpi, gf_bits);
+ // For key frames the frame target rate is set already.
+ // NOTE: We don't bother to check for the special case of ARF overlay
+ // frames here, as there is clamping code for this in the function
+ // vp9_rc_clamp_pframe_target_size(), which applies to one and two pass
+ // encodes.
+ if (cpi->common.frame_type != KEY_FRAME &&
+ !vp9_is_upper_layer_key_frame(cpi)) {
+ vp9_rc_set_frame_target(cpi, twopass->gf_bits);
}
}
- {
- // Adjust KF group bits and error remaining.
- twopass->kf_group_error_left -= (int64_t)gf_group_err;
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
- // If this is an arf update we want to remove the score for the overlay
- // frame at the end which will usually be very cheap to code.
- // The overlay frame has already, in effect, been coded so we want to spread
- // the remaining bits among the other frames.
- // For normal GFs remove the score for the GF itself unless this is
- // also a key frame in which case it has already been accounted for.
- if (rc->source_alt_ref_pending) {
- twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
- } else if (cpi->common.frame_type != KEY_FRAME) {
- twopass->gf_group_error_left = (int64_t)(gf_group_err
- - gf_first_frame_err);
- } else {
- twopass->gf_group_error_left = (int64_t)gf_group_err;
- }
-
- // This condition could fail if there are two kfs very close together
- // despite MIN_GF_INTERVAL and would cause a divide by 0 in the
- // calculation of alt_extra_bits.
- if (rc->baseline_gf_interval >= 3) {
- const int boost = rc->source_alt_ref_pending ? b_boost : rc->gfu_boost;
-
- if (boost >= 150) {
- const int pct_extra = MIN(20, (boost - 100) / 50);
- const int alt_extra_bits = (int)((
- MAX(twopass->gf_group_bits - twopass->gf_bits, 0) *
- pct_extra) / 100);
- twopass->gf_group_bits -= alt_extra_bits;
- }
- }
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ if (rc->source_alt_ref_pending) {
+ twopass->gf_group_error_left = (int64_t)(gf_group_err - mod_frame_err);
+ } else if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->gf_group_error_left = (int64_t)(gf_group_err
+ - gf_first_frame_err);
+ } else {
+ twopass->gf_group_error_left = (int64_t)gf_group_err;
}
// Calculate a section intra ratio used in setting max loop filter.
if (cpi->common.frame_type != KEY_FRAME) {
- calculate_section_intra_ratio(twopass, start_pos, rc->baseline_gf_interval);
+ twopass->section_intra_rating =
+ calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
+ rc->baseline_gf_interval);
}
}
// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- struct twopass_rc *twopass = &cpi->twopass;
+ TWO_PASS *twopass = &cpi->twopass;
// For a single frame.
const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
// Calculate modified prediction error used in bit allocation.
@@ -1856,7 +1796,7 @@
vp9_rc_set_frame_target(cpi, target_frame_size);
}
-static int test_candidate_kf(struct twopass_rc *twopass,
+static int test_candidate_kf(TWO_PASS *twopass,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *this_frame,
const FIRSTPASS_STATS *next_frame) {
@@ -1936,9 +1876,9 @@
static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i, j;
RATE_CONTROL *const rc = &cpi->rc;
- struct twopass_rc *const twopass = &cpi->twopass;
+ TWO_PASS *const twopass = &cpi->twopass;
const FIRSTPASS_STATS first_frame = *this_frame;
- const FIRSTPASS_STATS *start_position = twopass->stats_in;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS last_frame;
double decay_accumulator = 1.0;
@@ -2072,15 +2012,15 @@
} else {
twopass->kf_group_bits = 0;
}
+ twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
+
// Reset the first pass file position.
reset_fpf_position(twopass, start_position);
- // Determine how big to make this keyframe based on how well the subsequent
- // frames use inter blocks.
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
decay_accumulator = 1.0;
boost_score = 0.0;
-
- // Scan through the kf group collating various stats.
for (i = 0; i < rc->frames_to_key; ++i) {
if (EOF == input_stats(twopass, &next_frame))
break;
@@ -2117,84 +2057,29 @@
}
}
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
// Calculate a section intra ratio used in setting max loop filter.
- calculate_section_intra_ratio(twopass, start_position, rc->frames_to_key);
+ twopass->section_intra_rating =
+ calculate_section_intra_ratio(start_position, twopass->stats_in_end,
+ rc->frames_to_key);
// Work out how many bits to allocate for the key frame itself.
- if (1) {
- int kf_boost = (int)boost_score;
- int allocation_chunks;
+ rc->kf_boost = (int)boost_score;
- if (kf_boost < (rc->frames_to_key * 3))
- kf_boost = (rc->frames_to_key * 3);
+ if (rc->kf_boost < (rc->frames_to_key * 3))
+ rc->kf_boost = (rc->frames_to_key * 3);
+ if (rc->kf_boost < MIN_KF_BOOST)
+ rc->kf_boost = MIN_KF_BOOST;
- if (kf_boost < MIN_KF_BOOST)
- kf_boost = MIN_KF_BOOST;
+ twopass->kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
+ rc->kf_boost, twopass->kf_group_bits);
- // Make a note of baseline boost and the zero motion
- // accumulator value for use elsewhere.
- rc->kf_boost = kf_boost;
- twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+ twopass->kf_group_bits -= twopass->kf_bits;
- // Key frame size depends on:
- // (1) the error score for the whole key frame group,
- // (2) the key frames' own error if this is smaller than the
- // average for the group (optional),
- // (3) insuring that the frame receives at least the allocation it would
- // have received based on its own error score vs the error score
- // remaining.
- // Special case:
- // If the sequence appears almost totally static we want to spend almost
- // all of the bits on the key frame.
- //
- // We use (cpi->rc.frames_to_key - 1) below because the key frame itself is
- // taken care of by kf_boost.
- if (zero_motion_accumulator >= 0.99) {
- allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost;
- } else {
- allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost;
- }
-
- // Prevent overflow.
- if (kf_boost > 1028) {
- const int divisor = kf_boost >> 10;
- kf_boost /= divisor;
- allocation_chunks /= divisor;
- }
-
- twopass->kf_group_bits = MAX(0, twopass->kf_group_bits);
- // Calculate the number of bits to be spent on the key frame.
- twopass->kf_bits = (int)((double)kf_boost *
- ((double)twopass->kf_group_bits / allocation_chunks));
-
- // If the key frame is actually easier than the average for the
- // kf group (which does sometimes happen, e.g. a blank intro frame)
- // then use an alternate calculation based on the kf error score
- // which should give a smaller key frame.
- if (kf_mod_err < kf_group_err / rc->frames_to_key) {
- double alt_kf_grp_bits = ((double)twopass->bits_left *
- (kf_mod_err * (double)rc->frames_to_key) /
- DOUBLE_DIVIDE_CHECK(twopass->modified_error_left));
-
- const int alt_kf_bits = (int)((double)kf_boost *
- (alt_kf_grp_bits / (double)allocation_chunks));
-
- if (twopass->kf_bits > alt_kf_bits)
- twopass->kf_bits = alt_kf_bits;
- } else {
- // Else if it is much harder than other frames in the group make sure
- // it at least receives an allocation in keeping with its relative
- // error score.
- const int alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err /
- DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)));
-
- if (alt_kf_bits > twopass->kf_bits)
- twopass->kf_bits = alt_kf_bits;
- }
- twopass->kf_group_bits -= twopass->kf_bits;
- // Per frame bit target for this frame.
- vp9_rc_set_frame_target(cpi, twopass->kf_bits);
- }
+ // Per frame bit target for this frame.
+ vp9_rc_set_frame_target(cpi, twopass->kf_bits);
// Note the total error score of the kf group minus the key frame itself.
twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2238,17 +2123,15 @@
void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
- struct twopass_rc *const twopass = &cpi->twopass;
+ TWO_PASS *const twopass = &cpi->twopass;
int frames_left;
FIRSTPASS_STATS this_frame;
FIRSTPASS_STATS this_frame_copy;
- double this_frame_intra_error;
- double this_frame_coded_error;
int target;
LAYER_CONTEXT *lc = NULL;
- int is_spatial_svc = (cpi->use_svc && cpi->svc.number_temporal_layers == 1);
-
+ const int is_spatial_svc = (cpi->use_svc &&
+ cpi->svc.number_temporal_layers == 1);
if (is_spatial_svc) {
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
frames_left = (int)(twopass->total_stats.count -
@@ -2298,27 +2181,27 @@
if (EOF == input_stats(twopass, &this_frame))
return;
- this_frame_intra_error = this_frame.intra_error;
- this_frame_coded_error = this_frame.coded_error;
-
// Keyframe and section processing.
if (rc->frames_to_key == 0 ||
(cpi->frame_flags & FRAMEFLAGS_KEY)) {
// Define next KF group and assign bits to it.
this_frame_copy = this_frame;
find_next_key_frame(cpi, &this_frame_copy);
- // Don't place key frame in any enhancement layers in spatial svc
- if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
- lc->is_key_frame = 1;
- if (cpi->svc.spatial_layer_id > 0) {
- cm->frame_type = INTER_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+
+ if (is_spatial_svc) {
+ if (cpi->svc.spatial_layer_id == 0) {
+ lc->is_key_frame = (cm->frame_type == KEY_FRAME);
+ } else {
+ cm->frame_type = INTER_FRAME;
+ lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
+
+ if (lc->is_key_frame) {
+ cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
}
}
- } else {
- if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) {
- lc->is_key_frame = 0;
- }
- cm->frame_type = INTER_FRAME;
}
// Is this frame a GF / ARF? (Note: a key frame is always also a GF).
@@ -2355,9 +2238,6 @@
assign_std_frame_bits(cpi, &this_frame_copy);
}
- // Keep a globally available copy of this and the next frame's iiratio.
- twopass->this_iiratio = (int)(this_frame_intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
{
FIRSTPASS_STATS next_frame;
if (lookup_next_frame_stats(twopass, &next_frame) != EOF) {
@@ -2384,6 +2264,7 @@
}
void vp9_twopass_postencode_update(VP9_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
RATE_CONTROL *const rc = &cpi->rc;
#ifdef LONG_TERM_VBR_CORRECTION
// In this experimental mode, the VBR correction is done exclusively through
@@ -2405,14 +2286,13 @@
// vs. actual bitrate gradually as we progress towards the end of the
// sequence in order to mitigate this effect.
const double progress =
- (double)(cpi->twopass.stats_in - cpi->twopass.stats_in_start) /
- (cpi->twopass.stats_in_end - cpi->twopass.stats_in_start);
+ (double)(twopass->stats_in - twopass->stats_in_start) /
+ (twopass->stats_in_end - twopass->stats_in_start);
const int bits_used = (int)(progress * rc->this_frame_target +
(1.0 - progress) * rc->projected_frame_size);
#endif
- cpi->twopass.bits_left -= bits_used;
- cpi->twopass.bits_left = MAX(cpi->twopass.bits_left, 0);
+ twopass->bits_left = MAX(twopass->bits_left - bits_used, 0);
#ifdef LONG_TERM_VBR_CORRECTION
if (cpi->common.frame_type != KEY_FRAME &&
@@ -2422,12 +2302,12 @@
vp9_is_upper_layer_key_frame(cpi)) {
// For key frames kf_group_bits already had the target bits subtracted out.
// So now update to the correct value based on the actual bits used.
- cpi->twopass.kf_group_bits += cpi->rc.this_frame_target - bits_used;
+ twopass->kf_group_bits += rc->this_frame_target - bits_used;
} else {
#endif
- cpi->twopass.kf_group_bits -= bits_used;
- cpi->twopass.gf_group_bits -= bits_used;
- cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+ twopass->kf_group_bits -= bits_used;
+ twopass->gf_group_bits -= bits_used;
+ twopass->gf_group_bits = MAX(twopass->gf_group_bits, 0);
}
- cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
+ twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0);
}
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index f7ba423..d84793e 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -38,10 +38,9 @@
int64_t spatial_layer_id;
} FIRSTPASS_STATS;
-struct twopass_rc {
+typedef struct {
unsigned int section_intra_rating;
unsigned int next_iiratio;
- unsigned int this_iiratio;
FIRSTPASS_STATS total_stats;
FIRSTPASS_STATS this_frame_stats;
const FIRSTPASS_STATS *stats_in;
@@ -50,8 +49,6 @@
FIRSTPASS_STATS total_left_stats;
int first_pass_done;
int64_t bits_left;
- int64_t clip_bits_total;
- double avg_iiratio;
double modified_error_min;
double modified_error_max;
double modified_error_total;
@@ -72,7 +69,6 @@
int64_t gf_group_bits;
// Bits for the golden frame or ARF - 2 pass only
int gf_bits;
- int alt_extra_bits;
int sr_update_lag;
@@ -80,7 +76,7 @@
int gf_zeromotion_pct;
int active_worst_quality;
-};
+} TWO_PASS;
struct VP9_COMP;
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index e7dcc7a..041e583 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -20,7 +20,6 @@
#include "vp9/common/vp9_systemdependent.h"
-
static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
const MV *ref_mv,
MV *dst_mv,
@@ -236,9 +235,10 @@
int mb_col, mb_row, offset = 0;
int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
- MV arf_top_mv = {0, 0}, gld_top_mv = {0, 0};
- MODE_INFO mi_local = { { 0 } };
+ MV gld_top_mv = {0, 0};
+ MODE_INFO mi_local;
+ vp9_zero(mi_local);
// Set up limit values for motion vectors to prevent them extending outside
// the UMV borders.
x->mv_row_min = -BORDER_MV_PIXELS_B16;
@@ -253,7 +253,7 @@
mi_local.mbmi.ref_frame[1] = NONE;
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
- MV arf_left_mv = arf_top_mv, gld_left_mv = gld_top_mv;
+ MV gld_left_mv = gld_top_mv;
int mb_y_in_offset = mb_y_offset;
int arf_y_in_offset = arf_y_offset;
int gld_y_in_offset = gld_y_offset;
@@ -270,10 +270,8 @@
update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
golden_ref, &gld_left_mv, alt_ref,
mb_row, mb_col);
- arf_left_mv = mb_stats->ref[ALTREF_FRAME].m.mv.as_mv;
gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
if (mb_col == 0) {
- arf_top_mv = arf_left_mv;
gld_top_mv = gld_left_mv;
}
xd->left_available = 1;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 43c8ab8..4f7d6f1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -886,6 +886,10 @@
int r, c, i;
int start_col, end_col, start_row, end_row;
+ // The cfg and search_param parameters are not used in this search variant
+ (void)cfg;
+ (void)search_param;
+
clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
*best_mv = *ref_mv;
*num00 = 11;
@@ -1551,7 +1555,7 @@
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv,
- const uint8_t *second_pred, int w, int h) {
+ const uint8_t *second_pred) {
const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
{-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
const MACROBLOCKD *const xd = &x->e_mbd;
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 827957d..873edf3 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -144,8 +144,7 @@
MV *ref_mv, int error_per_bit,
int search_range,
const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv, const uint8_t *second_pred,
- int w, int h);
+ const MV *center_mv, const uint8_t *second_pred);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 78fba73..437b680 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -31,7 +31,7 @@
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int step_param;
int sadpb = x->sadperbit16;
MV mvp_full;
@@ -79,7 +79,7 @@
if (x->mv_best_ref_index[ref] < 2)
mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
else
- mvp_full = x->pred_mv[ref].as_mv;
+ mvp_full = x->pred_mv[ref];
mvp_full.col >>= 3;
mvp_full.row >>= 3;
@@ -110,7 +110,7 @@
MV *tmp_mv) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int ref = mbmi->ref_frame[0];
MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
int dis;
@@ -143,12 +143,13 @@
xd->plane[i].pre[0] = backup_yv12[i];
}
- x->pred_mv[ref].as_mv = *tmp_mv;
+ x->pred_mv[ref] = *tmp_mv;
}
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
- int *out_rate_sum, int64_t *out_dist_sum) {
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
@@ -162,6 +163,9 @@
unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
+ *var_y = var;
+ *sse_y = sse;
+
// TODO(jingning) This is a temporary solution to account for frames with
// light changes. Need to customize the rate-distortion modeling for non-RD
// mode decision.
@@ -198,6 +202,9 @@
int rate = INT_MAX;
int64_t dist = INT64_MAX;
+ // var_y and sse_y are saved so the skip check can reuse them.
+ unsigned int var_y = UINT_MAX;
+ unsigned int sse_y = UINT_MAX;
VP9_COMMON *cm = &cpi->common;
int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
@@ -219,8 +226,7 @@
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
x->skip = 0;
- if (!x->in_active_map)
- x->skip = 1;
+
// initialize mode decisions
*returnrate = INT_MAX;
*returndistortion = INT64_MAX;
@@ -318,54 +324,37 @@
pred_filter_search &&
((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
(mbmi->mv[0].as_mv.col & 0x07) != 0)) {
- int64_t tmp_rdcost1 = INT64_MAX;
- int64_t tmp_rdcost2 = INT64_MAX;
- int64_t tmp_rdcost3 = INT64_MAX;
int pf_rate[3];
int64_t pf_dist[3];
+ unsigned int pf_var[3];
+ unsigned int pf_sse[3];
+ int64_t best_cost = INT64_MAX;
+ INTERP_FILTER best_filter = SWITCHABLE, filter;
- mbmi->interp_filter = EIGHTTAP;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP],
- &pf_dist[EIGHTTAP]);
- tmp_rdcost1 = RDCOST(x->rdmult, x->rddiv,
- vp9_get_switchable_rate(cpi) + pf_rate[EIGHTTAP],
- pf_dist[EIGHTTAP]);
-
- mbmi->interp_filter = EIGHTTAP_SHARP;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SHARP],
- &pf_dist[EIGHTTAP_SHARP]);
- tmp_rdcost2 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
- pf_rate[EIGHTTAP_SHARP],
- pf_dist[EIGHTTAP_SHARP]);
-
- mbmi->interp_filter = EIGHTTAP_SMOOTH;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[EIGHTTAP_SMOOTH],
- &pf_dist[EIGHTTAP_SMOOTH]);
- tmp_rdcost3 = RDCOST(x->rdmult, x->rddiv, vp9_get_switchable_rate(cpi) +
- pf_rate[EIGHTTAP_SMOOTH],
- pf_dist[EIGHTTAP_SMOOTH]);
-
- if (tmp_rdcost2 < tmp_rdcost1) {
- if (tmp_rdcost2 < tmp_rdcost3)
- mbmi->interp_filter = EIGHTTAP_SHARP;
- else
- mbmi->interp_filter = EIGHTTAP_SMOOTH;
- } else {
- if (tmp_rdcost1 < tmp_rdcost3)
- mbmi->interp_filter = EIGHTTAP;
- else
- mbmi->interp_filter = EIGHTTAP_SMOOTH;
+ for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
+ int64_t cost;
+ mbmi->interp_filter = filter;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
+ &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
+ cost = RDCOST(x->rdmult, x->rddiv,
+ vp9_get_switchable_rate(cpi) + pf_rate[filter],
+ pf_dist[filter]);
+ if (cost < best_cost) {
+ best_filter = filter;
+ best_cost = cost;
+ }
}
+ mbmi->interp_filter = best_filter;
rate = pf_rate[mbmi->interp_filter];
dist = pf_dist[mbmi->interp_filter];
+ var_y = pf_var[mbmi->interp_filter];
+ sse_y = pf_sse[mbmi->interp_filter];
} else {
mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
}
rate += rate_mv;
@@ -373,7 +362,78 @@
[INTER_OFFSET(this_mode)];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
- if (this_rd < best_rd) {
+ // Skip check: test whether this block can be reconstructed from the
+ // prediction alone.
+ if (!x->in_active_map) {
+ x->skip = 1;
+ } else if (cpi->allow_encode_breakout && x->encode_breakout) {
+ const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+ unsigned int var = var_y, sse = sse_y;
+ // Skipping threshold for ac.
+ unsigned int thresh_ac;
+ // Skipping threshold for dc.
+ unsigned int thresh_dc;
+ // Cap the threshold to avoid a large PSNR loss at low bit rates.
+ // Use an extremely low threshold for static frames to limit skipping.
+ const unsigned int max_thresh = 36000;
+ // Floor: encode_breakout input scaled by 16, capped at max_thresh.
+ const unsigned int min_thresh =
+ MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+
+ // Calculate threshold according to dequant value.
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+ thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+
+ // Adjust ac threshold according to partition size.
+ thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]);
+
+ thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+
+ // Y skipping condition checking for ac and dc.
+ if (var <= thresh_ac && (sse - var) <= thresh_dc) {
+ unsigned int sse_u, sse_v;
+ unsigned int var_u, var_v;
+
+ // Skip the u/v prediction to save computation; this does not affect
+ // the result much.
+ var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
+ x->plane[1].src.stride,
+ xd->plane[1].dst.buf,
+ xd->plane[1].dst.stride, &sse_u);
+
+ // U skipping condition checking
+ if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+ var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
+ x->plane[2].src.stride,
+ xd->plane[2].dst.buf,
+ xd->plane[2].dst.stride, &sse_v);
+
+ // V skipping condition checking
+ if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+ x->skip = 1;
+
+ // The cost of skip bit needs to be added.
+ rate = rate_mv;
+ rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+
+ // More on this part of rate
+ // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+ // Scaling factor for SSE from spatial domain to frequency
+ // domain is 16. Adjust distortion accordingly.
+ // TODO(yunqingwang): In this function, only y-plane dist is
+ // calculated.
+ dist = (sse << 4); // + ((sse_u + sse_v) << 4);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ // *disable_skip = 1;
+ }
+ }
+ }
+ }
+
+ if (this_rd < best_rd || x->skip) {
best_rd = this_rd;
*returnrate = rate;
*returndistortion = dist;
@@ -381,6 +441,9 @@
best_pred_filter = mbmi->interp_filter;
best_ref_frame = ref_frame;
}
+
+ if (x->skip)
+ break;
}
}
@@ -392,14 +455,15 @@
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
- if (best_rd > inter_mode_thresh && bsize < cpi->sf.max_intra_bsize) {
+ if (!x->skip && best_rd > inter_mode_thresh &&
+ bsize < cpi->sf.max_intra_bsize) {
for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
mbmi->tx_size, this_mode,
&p->src.buf[0], p->src.stride,
&pd->dst.buf[0], pd->dst.stride, 0, 0, 0);
- model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
rate += cpi->mbmode_cost[this_mode];
rate += intra_cost_penalty;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 5206bb6..4d3086d 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -32,6 +32,7 @@
zbin_ptr[1] + zbin_oq_value };
const int nzbins[2] = { zbins[0] * -1,
zbins[1] * -1 };
+ (void)iscan;
vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
@@ -87,6 +88,7 @@
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
+ (void)iscan;
vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 0f4c4da..a04622c 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -48,6 +48,7 @@
static int arfgf_low_motion_minq[QINDEX_RANGE];
static int arfgf_high_motion_minq[QINDEX_RANGE];
static int inter_minq[QINDEX_RANGE];
+static int rtc_minq[QINDEX_RANGE];
static int gf_high = 2000;
static int gf_low = 400;
static int kf_high = 5000;
@@ -84,6 +85,7 @@
arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30);
arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50);
inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90);
+ rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70);
}
}
@@ -549,14 +551,14 @@
// Use the lower of active_worst_quality and recent/average Q.
if (cm->current_video_frame > 1) {
if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
- active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
else
- active_best_quality = inter_minq[active_worst_quality];
+ active_best_quality = rtc_minq[active_worst_quality];
} else {
if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
- active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
else
- active_best_quality = inter_minq[active_worst_quality];
+ active_best_quality = rtc_minq[active_worst_quality];
}
}
@@ -1145,10 +1147,6 @@
cpi->rc.frames_to_key--;
}
-static int test_for_kf_one_pass(VP9_COMP *cpi) {
- // Placeholder function for auto key frame
- return 0;
-}
// Use this macro to turn on/off use of alt-refs in one-pass mode.
#define USE_ALTREF_FOR_ONE_PASS 1
@@ -1180,11 +1178,12 @@
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
if (!cpi->refresh_alt_ref_frame &&
(cm->current_video_frame == 0 ||
(cpi->frame_flags & FRAMEFLAGS_KEY) ||
rc->frames_to_key == 0 ||
- (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
+ (cpi->oxcf.auto_key && 0))) {
cm->frame_type = KEY_FRAME;
rc->this_key_frame_forced = cm->current_video_frame != 0 &&
rc->frames_to_key == 0;
@@ -1311,10 +1310,11 @@
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
if ((cm->current_video_frame == 0 ||
(cpi->frame_flags & FRAMEFLAGS_KEY) ||
rc->frames_to_key == 0 ||
- (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) {
+ (cpi->oxcf.auto_key && 0))) {
cm->frame_type = KEY_FRAME;
rc->this_key_frame_forced = cm->current_video_frame != 0 &&
rc->frames_to_key == 0;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 22e19fe..601e64d 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -242,6 +242,31 @@
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int m, int n, int min_plane, int max_plane) {
+ int i;
+
+ for (i = min_plane; i < max_plane; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
+
+ p->coeff = ctx->coeff_pbuf[i][m];
+ p->qcoeff = ctx->qcoeff_pbuf[i][m];
+ pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
+ p->eobs = ctx->eobs_pbuf[i][m];
+
+ ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n];
+ ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n];
+ ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
+ ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n];
+
+ ctx->coeff_pbuf[i][n] = p->coeff;
+ ctx->qcoeff_pbuf[i][n] = p->qcoeff;
+ ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
+ ctx->eobs_pbuf[i][n] = p->eobs;
+ }
+}
+
static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
int i, bsize, segment_id;
@@ -297,7 +322,7 @@
fill_token_costs(x->token_costs, cm->fc.coef_probs);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
+ vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
}
@@ -745,7 +770,8 @@
int use_fast_coef_casting) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
- struct rdcost_block_args args = { 0 };
+ struct rdcost_block_args args;
+ vp9_zero(args);
args.x = x;
args.best_rd = ref_best_rd;
args.use_fast_coef_costing = use_fast_coef_casting;
@@ -1387,27 +1413,8 @@
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
- if (!x->select_txfm_size) {
- int i;
- struct macroblock_plane *const p = x->plane;
- struct macroblockd_plane *const pd = xd->plane;
- for (i = 1; i < MAX_MB_PLANE; ++i) {
- p[i].coeff = ctx->coeff_pbuf[i][2];
- p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
- pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
- p[i].eobs = ctx->eobs_pbuf[i][2];
-
- ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0];
- ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0];
- ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
- ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0];
-
- ctx->coeff_pbuf[i][0] = p[i].coeff;
- ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
- ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
- ctx->eobs_pbuf[i][0] = p[i].eobs;
- }
- }
+ if (!x->select_txfm_size)
+ swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
}
}
@@ -1843,8 +1850,8 @@
mvp_full.col = bsi->mvp.as_mv.col >> 3;
if (cpi->sf.adaptive_motion_search && cm->show_frame) {
- mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
- mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
+ mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
+ mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
step_param = MAX(step_param, 8);
}
@@ -1895,7 +1902,7 @@
}
if (cpi->sf.adaptive_motion_search)
- x->pred_mv[mbmi->ref_frame[0]].as_mv = *new_mv;
+ x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
@@ -2101,14 +2108,14 @@
cpi->common.show_frame &&
block_size < cpi->sf.max_partition_size);
- int_mv pred_mv[3];
- pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
- pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
+ MV pred_mv[3];
+ pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv;
+ pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv;
pred_mv[2] = x->pred_mv[ref_frame];
// Get the sad for each candidate reference mv
for (i = 0; i < num_mv_refs; i++) {
- this_mv.as_int = pred_mv[i].as_int;
+ this_mv.as_mv = pred_mv[i];
max_mv = MAX(max_mv,
MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
@@ -2215,10 +2222,6 @@
ctx->skip = x->skip;
ctx->best_mode_index = mode_index;
ctx->mic = *xd->mi[0];
-
- ctx->best_ref_mv[0].as_int = ref_mv->as_int;
- ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
-
ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
@@ -2312,7 +2315,7 @@
MACROBLOCKD *xd = &x->e_mbd;
const VP9_COMMON *cm = &cpi->common;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int bestsme = INT_MAX;
int step_param;
int sadpb = x->sadperbit16;
@@ -2331,7 +2334,7 @@
MV pred_mv[3];
pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
- pred_mv[2] = x->pred_mv[ref].as_mv;
+ pred_mv[2] = x->pred_mv[ref];
if (scaled_ref_frame) {
int i;
@@ -2376,7 +2379,8 @@
for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
- x->pred_mv[ref].as_int = 0;
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
tmp_mv->as_int = INVALID_MV;
if (scaled_ref_frame) {
@@ -2417,7 +2421,7 @@
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (cpi->sf.adaptive_motion_search && cm->show_frame)
- x->pred_mv[ref].as_int = tmp_mv->as_int;
+ x->pred_mv[ref] = tmp_mv->as_mv;
if (scaled_ref_frame) {
int i;
@@ -2514,8 +2518,7 @@
bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
search_range,
&cpi->fn_ptr[bsize],
- &ref_mv[id].as_mv, second_pred,
- pw, ph);
+ &ref_mv[id].as_mv, second_pred);
if (bestsme < INT_MAX)
bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
second_pred, &cpi->fn_ptr[bsize], 1);
@@ -2931,30 +2934,6 @@
return this_rd; // if 0, this will be re-calculated by caller
}
-static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
- int max_plane) {
- struct macroblock_plane *const p = x->plane;
- struct macroblockd_plane *const pd = x->e_mbd.plane;
- int i;
-
- for (i = 0; i < max_plane; ++i) {
- p[i].coeff = ctx->coeff_pbuf[i][1];
- p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
- pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
- p[i].eobs = ctx->eobs_pbuf[i][1];
-
- ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0];
- ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0];
- ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
- ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0];
-
- ctx->coeff_pbuf[i][0] = p[i].coeff;
- ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
- ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
- ctx->eobs_pbuf[i][0] = p[i].eobs;
- }
-}
-
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int64_t *returndist,
BLOCK_SIZE bsize,
@@ -3069,7 +3048,7 @@
int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
- MB_MODE_INFO best_mbmode = { 0 };
+ MB_MODE_INFO best_mbmode;
int mode_index, best_mode_index = -1;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
@@ -3095,7 +3074,7 @@
const int intra_y_mode_mask =
cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
-
+ vp9_zero(best_mbmode);
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
@@ -3476,7 +3455,7 @@
best_mbmode = *mbmi;
best_skip2 = this_skip2;
if (!x->select_txfm_size)
- swap_block_ptr(x, ctx, max_plane);
+ swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -3678,7 +3657,7 @@
int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
- MB_MODE_INFO best_mbmode = { 0 };
+ MB_MODE_INFO best_mbmode;
int ref_index, best_ref_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
@@ -3698,6 +3677,7 @@
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+ vp9_zero(best_mbmode);
for (i = 0; i < 4; i++) {
int j;
@@ -4130,7 +4110,7 @@
best_mbmode = *mbmi;
best_skip2 = this_skip2;
if (!x->select_txfm_size)
- swap_block_ptr(x, ctx, max_plane);
+ swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
sizeof(uint8_t) * ctx->num_4x4_blk);
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 7537d1b..574df62 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -109,7 +109,7 @@
}
static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8,
+ MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -121,7 +121,7 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- xd->mi = mi_8x8;
+ xd->mi = mi;
segment_id = xd->mi[0]->mbmi.segment_id;
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -131,7 +131,7 @@
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
- const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
@@ -143,14 +143,14 @@
xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
temporal_predictor_count[pred_context][pred_flag]++;
+ // Update the "unpredicted" segment count
if (!pred_flag)
- // Update the "unpredicted" segment count
t_unpred_seg_counts[segment_id]++;
}
}
static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8,
+ MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -164,22 +164,22 @@
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type];
- bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
+ bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
- count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, bs, mi_row, mi_col);
} else if (bw == bs && bh < bs) {
- count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
- count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts,
+ count_segs(cpi, tile, mi + hbs * mis, no_pred_segcounts,
temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
mi_row + hbs, mi_col);
} else if (bw < bs && bh == bs) {
- count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
- count_segs(cpi, tile, mi_8x8 + hbs,
+ count_segs(cpi, tile, mi + hbs,
no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
hbs, bs, mi_row, mi_col + hbs);
} else {
@@ -192,7 +192,7 @@
const int mi_dc = hbs * (n & 1);
const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc],
+ count_segs_sb(cpi, tile, &mi[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
mi_row + mi_dr, mi_col + mi_dc, subsize);
@@ -217,9 +217,6 @@
vp9_prob t_pred_tree[SEG_TREE_PROBS];
vp9_prob t_nopred_prob[PREDICTION_PROBS];
- const int mis = cm->mi_stride;
- MODE_INFO **mi_ptr, **mi;
-
// Set default state for the segment tree probabilities and the
// temporal coding probabilities
vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
@@ -229,12 +226,13 @@
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
TileInfo tile;
-
+ MODE_INFO **mi_ptr;
vp9_tile_init(&tile, cm, 0, tile_col);
+
mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
- mi_row += 8, mi_ptr += 8 * mis) {
- mi = mi_ptr;
+ mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
+ MODE_INFO **mi = mi_ptr;
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 93e23ee..7b2d1e2 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -146,7 +146,6 @@
int speed) {
sf->static_segmentation = 0;
sf->adaptive_rd_thresh = 1;
- sf->encode_breakout_thresh = 1;
sf->use_fast_coef_costing = 1;
if (speed == 1) {
@@ -169,7 +168,6 @@
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
- sf->encode_breakout_thresh = 8;
}
if (speed >= 2) {
@@ -208,7 +206,6 @@
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
- sf->encode_breakout_thresh = 200;
}
if (speed >= 3) {
@@ -226,7 +223,6 @@
sf->optimize_coefficients = 0;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->lpf_pick = LPF_PICK_FROM_Q;
- sf->encode_breakout_thresh = 700;
}
if (speed >= 4) {
@@ -245,7 +241,6 @@
}
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_ONLY;
sf->frame_parameter_update = 0;
- sf->encode_breakout_thresh = 1000;
sf->search_method = FAST_HEX;
sf->disable_inter_mode_mask[BLOCK_32X32] = 1 << INTER_OFFSET(ZEROMV);
sf->disable_inter_mode_mask[BLOCK_32X64] = ~(1 << INTER_OFFSET(NEARESTMV));
@@ -338,7 +333,6 @@
sf->use_fast_coef_costing = 0;
sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
sf->use_nonrd_pick_mode = 0;
- sf->encode_breakout_thresh = 0;
for (i = 0; i < BLOCK_SIZES; ++i)
sf->disable_inter_mode_mask[i] = 0;
sf->max_intra_bsize = BLOCK_64X64;
@@ -384,10 +378,6 @@
cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1;
- if (cpi->encode_breakout && oxcf->mode == REALTIME &&
- sf->encode_breakout_thresh > cpi->encode_breakout)
- cpi->encode_breakout = sf->encode_breakout_thresh;
-
if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
sf->adaptive_pred_interp_filter = 0;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 46806c9..d8c1a8b 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -321,10 +321,6 @@
// This flag controls the use of non-RD mode decision.
int use_nonrd_pick_mode;
- // This variable sets the encode_breakout threshold. Currently, it is only
- // enabled in real time mode.
- int encode_breakout_thresh;
-
// A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
// modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
int disable_inter_mode_mask[BLOCK_SIZES];
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 2e98fa7..dd28496 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -203,7 +203,7 @@
int i;
for (i = 0; i < svc->number_spatial_layers; ++i) {
- struct twopass_rc *const twopass = &svc->layer_context[i].twopass;
+ TWO_PASS *const twopass = &svc->layer_context[i].twopass;
svc->spatial_layer_id = i;
vp9_init_second_pass(cpi);
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 74d9c1c..6881ce1 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -27,7 +27,7 @@
int64_t maximum_buffer_size;
double framerate;
int avg_frame_size;
- struct twopass_rc twopass;
+ TWO_PASS twopass;
struct vpx_fixed_buf rc_twopass_stats_in;
unsigned int current_video_frame_in_layer;
int is_key_frame;
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6eff200..f501971 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -34,7 +34,8 @@
uint8_t *u_mb_ptr,
uint8_t *v_mb_ptr,
int stride,
- int uv_block_size,
+ int uv_block_width,
+ int uv_block_height,
int mv_row,
int mv_col,
uint8_t *pred,
@@ -47,7 +48,7 @@
enum mv_precision mv_precision_uv;
int uv_stride;
- if (uv_block_size == 8) {
+ if (uv_block_width == 8) {
uv_stride = (stride + 1) >> 1;
mv_precision_uv = MV_PRECISION_Q4;
} else {
@@ -64,18 +65,18 @@
kernel, MV_PRECISION_Q3, x, y);
vp9_build_inter_predictor(u_mb_ptr, uv_stride,
- &pred[256], uv_block_size,
+ &pred[256], uv_block_width,
&mv,
scale,
- uv_block_size, uv_block_size,
+ uv_block_width, uv_block_height,
which_mv,
kernel, mv_precision_uv, x, y);
vp9_build_inter_predictor(v_mb_ptr, uv_stride,
- &pred[512], uv_block_size,
+ &pred[512], uv_block_width,
&mv,
scale,
- uv_block_size, uv_block_size,
+ uv_block_width, uv_block_height,
which_mv,
kernel, mv_precision_uv, x, y);
}
@@ -91,7 +92,8 @@
void vp9_temporal_filter_apply_c(uint8_t *frame1,
unsigned int stride,
uint8_t *frame2,
- unsigned int block_size,
+ unsigned int block_width,
+ unsigned int block_height,
int strength,
int filter_weight,
unsigned int *accumulator,
@@ -101,8 +103,8 @@
int byte = 0;
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
- for (i = 0, k = 0; i < block_size; i++) {
- for (j = 0; j < block_size; j++, k++) {
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
int src_byte = frame1[byte];
int pixel_value = *frame2++;
@@ -127,7 +129,7 @@
byte++;
}
- byte += stride - block_size;
+ byte += stride - block_width;
}
}
@@ -204,14 +206,12 @@
uint8_t *dst1, *dst2;
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
// Save input state
uint8_t* input_buffer[MAX_MB_PLANE];
int i;
- // TODO(aconverse): Add 4:2:2 support
- assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y);
-
for (i = 0; i < MAX_MB_PLANE; i++)
input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -275,7 +275,7 @@
cpi->frames[frame]->u_buffer + mb_uv_offset,
cpi->frames[frame]->v_buffer + mb_uv_offset,
cpi->frames[frame]->y_stride,
- mb_uv_height,
+ mb_uv_width, mb_uv_height,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
predictor, scale,
@@ -283,16 +283,17 @@
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, strength, filter_weight,
+ predictor, 16, 16,
+ strength, filter_weight,
accumulator, count);
-
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256, mb_uv_height, strength,
+ predictor + 256,
+ mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 256,
count + 256);
-
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512, mb_uv_height, strength,
+ predictor + 512,
+ mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 512,
count + 512);
}
@@ -321,7 +322,7 @@
stride = cpi->alt_ref_buffer.uv_stride;
byte = mb_uv_offset;
for (i = 0, k = 256; i < mb_uv_height; i++) {
- for (j = 0; j < mb_uv_height; j++, k++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
int m = k + 256;
// U
@@ -339,13 +340,13 @@
// move to next pixel
byte++;
}
- byte += stride - mb_uv_height;
+ byte += stride - mb_uv_width;
}
mb_y_offset += 16;
- mb_uv_offset += mb_uv_height;
+ mb_uv_offset += mb_uv_width;
}
mb_y_offset += 16 * (f->y_stride - mb_cols);
- mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
}
// Restore input state
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 8ce98d9..dcca92d 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -232,7 +232,6 @@
cpi->common.fc.coef_probs[tx_size][type][ref];
unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size][type][ref];
-
const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
@@ -289,14 +288,17 @@
MACROBLOCK *x;
int *skippable;
};
-
static void is_skippable(int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *argv) {
struct is_skippable_args *args = argv;
+ (void)plane_bsize;
+ (void)tx_size;
args->skippable[0] &= (!args->x->plane[plane].eobs[block]);
}
+// TODO(yaowu): rewrite and optimize this function to remove the usage of
+// vp9_foreach_transform_block() and simplify is_skippable().
int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
int result = 1;
struct is_skippable_args args = {x, &result};
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 6865822..1f58d87 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -1187,7 +1187,7 @@
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1513,8 +1513,8 @@
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1535,8 +1535,8 @@
const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
// dct_const_round_shift
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1554,10 +1554,10 @@
{
step1_0 = _mm_add_epi16(step3_0, step2_1);
step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_sub_epi16(step3_3, step2_2);
- step1_3 = _mm_add_epi16(step3_3, step2_2);
- step1_4 = _mm_add_epi16(step3_4, step2_5);
- step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_2 = _mm_add_epi16(step3_3, step2_2);
+ step1_3 = _mm_sub_epi16(step3_3, step2_2);
+ step1_4 = _mm_sub_epi16(step3_4, step2_5);
+ step1_5 = _mm_add_epi16(step3_4, step2_5);
step1_6 = _mm_sub_epi16(step3_7, step2_6);
step1_7 = _mm_add_epi16(step3_7, step2_6);
}
@@ -1848,7 +1848,7 @@
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -2052,10 +2052,10 @@
v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
@@ -2085,10 +2085,10 @@
// stage 5
s[0] = _mm_add_epi16(p[0], t[1]);
s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_sub_epi16(p[3], t[2]);
- s[3] = _mm_add_epi16(p[3], t[2]);
- s[4] = _mm_add_epi16(p[4], t[5]);
- s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[2] = _mm_add_epi16(p[3], t[2]);
+ s[3] = _mm_sub_epi16(p[3], t[2]);
+ s[4] = _mm_sub_epi16(p[4], t[5]);
+ s[5] = _mm_add_epi16(p[4], t[5]);
s[6] = _mm_sub_epi16(p[7], t[6]);
s[7] = _mm_add_epi16(p[7], t[6]);
diff --git a/vp9/encoder/x86/vp9_error_intrin_avx2.c b/vp9/encoder/x86/vp9_error_intrin_avx2.c
new file mode 100644
index 0000000..c67490f
--- /dev/null
+++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vpx/vpx_integer.h"
+
+
+int64_t vp9_block_error_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz) {
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_set1_epi16(0);
+
+ // init sse and ssz registers to zero
+ sse_reg = _mm256_set1_epi16(0);
+ ssz_reg = _mm256_set1_epi16(0);
+
+ for (i = 0 ; i < block_size ; i+= 16) {
+ // load 32 bytes from coeff and dqcoeff
+ coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
+ dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the higher 64 bit of each 128 bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the higher 64 bit to the low 64 bit
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add together the two 128 bit lanes of the 256 bit register
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i*)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
+ return sse;
+}
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index d2d13b3..673e0b3 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -15,41 +15,45 @@
; (unsigned char *frame1, | 0
; unsigned int stride, | 1
; unsigned char *frame2, | 2
-; unsigned int block_size, | 3
-; int strength, | 4
-; int filter_weight, | 5
-; unsigned int *accumulator, | 6
-; unsigned short *count) | 7
+; unsigned int block_width, | 3
+; unsigned int block_height, | 4
+; int strength, | 5
+; int filter_weight, | 6
+; unsigned int *accumulator, | 7
+; unsigned short *count) | 8
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
sym(vp9_temporal_filter_apply_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
+ SHADOW_ARGS_TO_STACK 9
SAVE_XMM 7
GET_GOT rbx
push rsi
push rdi
ALIGN_STACK 16, rax
- %define block_size 0
- %define strength 16
- %define filter_weight 32
- %define rounding_bit 48
- %define rbp_backup 64
- %define stack_size 80
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
sub rsp, stack_size
mov [rsp + rbp_backup], rbp
; end prolog
mov rdx, arg(3)
- mov [rsp + block_size], rdx
- movd xmm6, arg(4)
+ mov [rsp + block_width], rdx
+ mov rdx, arg(4)
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5)
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
; calculate the rounding bit outside the loop
; 0x8000 >> (16 - strength)
mov rdx, 16
- sub rdx, arg(4) ; 16 - strength
+ sub rdx, arg(5) ; 16 - strength
movq xmm4, rdx ; can't use rdx w/ shift
movdqa xmm5, [GLOBAL(_const_top_bit)]
psrlw xmm5, xmm4
@@ -57,11 +61,11 @@
mov rsi, arg(0) ; src/frame1
mov rdx, arg(2) ; predictor frame
- mov rdi, arg(6) ; accumulator
- mov rax, arg(7) ; count
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
; dup the filter weight and store for later
- movd xmm0, arg(5) ; filter_weight
+ movd xmm0, arg(6) ; filter_weight
pshuflw xmm0, xmm0, 0
punpcklwd xmm0, xmm0
movdqa [rsp + filter_weight], xmm0
@@ -69,10 +73,11 @@
mov rbp, arg(1) ; stride
pxor xmm7, xmm7 ; zero for extraction
- lea rcx, [rdx + 16*16*1]
- cmp dword ptr [rsp + block_size], 8
+ mov rcx, [rsp + block_width]
+ imul rcx, [rsp + block_height]
+ add rcx, rdx
+ cmp dword ptr [rsp + block_width], 8
jne .temporal_filter_apply_load_16
- lea rcx, [rdx + 8*8*1]
.temporal_filter_apply_load_8:
movq xmm0, [rsi] ; first row
@@ -178,7 +183,7 @@
cmp rdx, rcx
je .temporal_filter_apply_epilog
pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_size], 16
+ cmp dword ptr [rsp + block_width], 16
je .temporal_filter_apply_load_16
jmp .temporal_filter_apply_load_8
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index c4058bb..5a8a4f4 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -211,8 +211,8 @@
ERROR("Option --tune=ssim is not currently supported in VP9.");
if (cfg->g_pass == VPX_RC_LAST_PASS) {
- size_t packet_sz = sizeof(FIRSTPASS_STATS);
- int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
const FIRSTPASS_STATS *stats;
if (cfg->rc_twopass_stats_in.buf == NULL)
@@ -464,7 +464,7 @@
static vpx_codec_err_t ctrl_get_param(vpx_codec_alg_priv_t *ctx, int ctrl_id,
va_list args) {
- void *arg = va_arg(args, void *);
+ void *const arg = va_arg(args, void *);
#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
@@ -525,6 +525,7 @@
static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data) {
vpx_codec_err_t res = VPX_CODEC_OK;
+ (void)data;
if (ctx->priv == NULL) {
int i;
@@ -880,14 +881,15 @@
return res;
}
-static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx,
+static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
}
static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+ (void)ctrl_id;
if (frame != NULL) {
YV12_BUFFER_CONFIG sd;
@@ -902,8 +904,9 @@
}
static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *);
+ (void)ctrl_id;
if (frame != NULL) {
YV12_BUFFER_CONFIG sd;
@@ -918,11 +921,12 @@
}
static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
- vp9_ref_frame_t *frame = va_arg(args, vp9_ref_frame_t *);
+ int ctrl_id, va_list args) {
+ vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *);
+ (void)ctrl_id;
if (frame != NULL) {
- YV12_BUFFER_CONFIG* fb;
+ YV12_BUFFER_CONFIG *fb;
vp9_get_reference_enc(ctx->cpi, frame->idx, &fb);
yuvconfig2image(&frame->img, fb, NULL);
@@ -955,7 +959,8 @@
static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = {0};
+ vp9_ppflags_t flags;
+ vp9_zero(flags);
if (ctx->preview_ppcfg.post_proc_flag) {
flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
@@ -972,39 +977,51 @@
}
static vpx_codec_err_t ctrl_update_entropy(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
const int update = va_arg(args, int);
+ (void)ctrl_id;
+
vp9_update_entropy(ctx->cpi, update);
return VPX_CODEC_OK;
}
static vpx_codec_err_t ctrl_update_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
const int ref_frame_flags = va_arg(args, int);
+ (void)ctrl_id;
+
vp9_update_reference(ctx->cpi, ref_frame_flags);
return VPX_CODEC_OK;
}
static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
const int reference_flag = va_arg(args, int);
+ (void)ctrl_id;
+
vp9_use_as_reference(ctx->cpi, reference_flag);
return VPX_CODEC_OK;
}
static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
+ (void)ctx;
+ (void)ctrl_id;
+ (void)args;
+
// TODO(yaowu): Need to re-implement and test for VP9.
return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+ (void)ctrl_id;
if (map) {
- if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+ if (!vp9_set_active_map(ctx->cpi, map->active_map,
+ (int)map->rows, (int)map->cols))
return VPX_CODEC_OK;
else
return VPX_CODEC_INVALID_PARAM;
@@ -1014,8 +1031,9 @@
}
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
+ (void)ctrl_id;
if (mode) {
const int res = vp9_set_internal_size(ctx->cpi,
@@ -1027,10 +1045,12 @@
}
}
-static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
+static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, int ctrl_id,
va_list args) {
int data = va_arg(args, int);
const vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+ (void)ctrl_id;
+
vp9_set_svc(ctx->cpi, data);
// CBR or two pass mode for SVC with both temporal and spatial layers
// not yet supported.
@@ -1046,11 +1066,12 @@
}
static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
+ int ctrl_id, va_list args) {
vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *);
VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
SVC *const svc = &cpi->svc;
+ (void)ctrl_id;
+
svc->spatial_layer_id = data->spatial_layer_id;
svc->temporal_layer_id = data->temporal_layer_id;
// Checks on valid layer_id input.
@@ -1066,9 +1087,10 @@
}
static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
- int ctr_id, va_list args) {
+ int ctrl_id, va_list args) {
VP9_COMP *const cpi = ctx->cpi;
vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *);
+ (void)ctrl_id;
if (params == NULL)
return VPX_CODEC_INVALID_PARAM;
@@ -1213,6 +1235,7 @@
NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t
NOT_IMPLEMENTED, // vpx_codec_decode_fn_t
NOT_IMPLEMENTED, // vpx_codec_frame_get_fn_t
+ NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t
},
{ // NOLINT
encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t
@@ -1221,6 +1244,6 @@
encoder_set_config, // vpx_codec_enc_config_set_fn_t
NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t
encoder_get_preview, // vpx_codec_get_preview_frame_fn_t
- NOT_IMPLEMENTED , // vpx_codec_enc_mr_get_mem_loc_fn_t
+ NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 9ba4d56..328b98f 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -32,21 +32,12 @@
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
- int decoder_init;
struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
-#if CONFIG_POSTPROC_VISUALIZER
- unsigned int dbg_postproc_flag;
- int dbg_color_ref_frame_flag;
- int dbg_color_mb_modes_flag;
- int dbg_color_b_modes_flag;
- int dbg_display_mv_flag;
-#endif
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
- int img_setup;
int img_avail;
int invert_tile_order;
@@ -226,22 +217,10 @@
static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
vp9_ppflags_t *flags) {
flags->post_proc_flag =
-#if CONFIG_POSTPROC_VISUALIZER
- (ctx->dbg_color_ref_frame_flag ? VP9D_DEBUG_CLR_FRM_REF_BLKS : 0) |
- (ctx->dbg_color_mb_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
- (ctx->dbg_color_b_modes_flag ? VP9D_DEBUG_CLR_BLK_MODES : 0) |
- (ctx->dbg_display_mv_flag ? VP9D_DEBUG_DRAW_MV : 0) |
-#endif
ctx->postproc_cfg.post_proc_flag;
flags->deblocking_level = ctx->postproc_cfg.deblocking_level;
flags->noise_level = ctx->postproc_cfg.noise_level;
-#if CONFIG_POSTPROC_VISUALIZER
- flags->display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
- flags->display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
- flags->display_b_modes_flag = ctx->dbg_color_b_modes_flag;
- flags->display_mv_flag = ctx->dbg_display_mv_flag;
-#endif
}
static void init_decoder(vpx_codec_alg_priv_t *ctx) {
@@ -252,8 +231,6 @@
ctx->pbi->max_threads = ctx->cfg.threads;
ctx->pbi->inv_tile_order = ctx->invert_tile_order;
- vp9_initialize_dec();
-
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
if (!ctx->postproc_cfg_set &&
@@ -286,12 +263,10 @@
}
// Initialize the decoder instance on the first frame
- if (!ctx->decoder_init) {
+ if (ctx->pbi == NULL) {
init_decoder(ctx);
if (ctx->pbi == NULL)
return VPX_CODEC_ERROR;
-
- ctx->decoder_init = 1;
}
// Set these even if already initialized. The caller may have changed the
@@ -540,22 +515,7 @@
static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx,
int ctrl_id, va_list args) {
-#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
- int data = va_arg(args, int);
-
-#define MAP(id, var) case id: var = data; break;
-
- switch (ctrl_id) {
- MAP(VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
- MAP(VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
- MAP(VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
- MAP(VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
- }
-
- return VPX_CODEC_OK;
-#else
return VPX_CODEC_INCAPABLE;
-#endif
}
static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index bc9a478..6e5c521 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -103,6 +103,7 @@
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 51ca65e..83c3308 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -170,8 +170,8 @@
* \ref MUST be non-zero.
*/
typedef const struct vpx_codec_ctrl_fn_map {
- int ctrl_id;
- vpx_codec_control_fn_t fn;
+ int ctrl_id;
+ vpx_codec_control_fn_t fn;
} vpx_codec_ctrl_fn_map_t;
/*!\brief decode data function pointer prototype
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index b874be7..4009a8a 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -496,7 +496,6 @@
vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *enc_cfg) {
- int max_intra_size_pct;
vpx_codec_err_t res;
SvcInternal *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
@@ -575,7 +574,6 @@
// modify encoder configuration
enc_cfg->ss_number_layers = si->layers;
enc_cfg->ts_number_layers = 1; // Temporal layers not used in this encoder.
- enc_cfg->kf_mode = VPX_KF_DISABLED;
// Lag in frames not currently supported
enc_cfg->g_lag_in_frames = 0;
@@ -605,16 +603,8 @@
}
vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
- vpx_codec_control(codec_ctx, VP8E_SET_CPUUSED, 1);
- vpx_codec_control(codec_ctx, VP8E_SET_STATIC_THRESHOLD, 1);
- vpx_codec_control(codec_ctx, VP8E_SET_NOISE_SENSITIVITY, 1);
vpx_codec_control(codec_ctx, VP8E_SET_TOKEN_PARTITIONS, 1);
- max_intra_size_pct =
- (int)(((double)enc_cfg->rc_buf_optimal_sz * 0.5) *
- ((double)enc_cfg->g_timebase.den / enc_cfg->g_timebase.num) / 10.0);
- vpx_codec_control(codec_ctx, VP8E_SET_MAX_INTRA_BITRATE_PCT,
- max_intra_size_pct);
return VPX_CODEC_OK;
}
@@ -869,8 +859,7 @@
si->rc_stats_buf_used = 0;
si->layers = svc_ctx->spatial_layers;
- if (si->frame_within_gop >= si->kf_dist ||
- si->encode_frame_count == 0) {
+ if (si->encode_frame_count == 0) {
si->frame_within_gop = 0;
}
si->is_keyframe = (si->frame_within_gop == 0);
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index 98d1d56..a1ad3c5 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -15,8 +15,10 @@
API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
-API_SRCS-$(CONFIG_VP9_ENCODER) += src/svc_encodeframe.c
-API_SRCS-$(CONFIG_VP9_ENCODER) += svc_context.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+ API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c
+ API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h
+endif
API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
diff --git a/vpxdec.c b/vpxdec.c
index ed37c70..127e65f 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -873,8 +873,16 @@
}
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+#if CONFIG_LIBYUV
vpx_image_scale(img, scaled_img, kFilterBox);
img = scaled_img;
+#else
+ fprintf(stderr, "Failed to scale output frame: %s.\n"
+ "Scaling is disabled in this configuration. "
+ "To enable scaling, configure with --enable-libyuv\n",
+ vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+#endif
}
}
diff --git a/vpxenc.c b/vpxenc.c
index 96a7ab6..d46a83e 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -755,7 +755,7 @@
input->height = input->y4m.pic_h;
input->framerate.numerator = input->y4m.fps_n;
input->framerate.denominator = input->y4m.fps_d;
- input->use_i420 = 0;
+ input->fmt = input->y4m.vpx_fmt;
} else
fatal("Unsupported Y4M stream.");
} else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
@@ -1059,6 +1059,23 @@
}
}
+static const char* file_type_to_string(enum VideoFileType t) {
+ switch (t) {
+ case FILE_TYPE_RAW: return "RAW";
+ case FILE_TYPE_Y4M: return "Y4M";
+ default: return "Other";
+ }
+}
+
+static const char* image_format_to_string(vpx_img_fmt_t f) {
+ switch (f) {
+ case VPX_IMG_FMT_I420: return "I420";
+ case VPX_IMG_FMT_I422: return "I422";
+ case VPX_IMG_FMT_I444: return "I444";
+ case VPX_IMG_FMT_YV12: return "YV12";
+ default: return "Other";
+ }
+}
static void show_stream_config(struct stream_state *stream,
struct VpxEncoderConfig *global,
@@ -1070,8 +1087,10 @@
if (stream->index == 0) {
fprintf(stderr, "Codec: %s\n",
vpx_codec_iface_name(global->codec->interface()));
- fprintf(stderr, "Source file: %s Format: %s\n", input->filename,
- input->use_i420 ? "I420" : "YV12");
+ fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
+ input->filename,
+ file_type_to_string(input->file_type),
+ image_format_to_string(input->fmt));
}
if (stream->next || stream->index)
fprintf(stderr, "\nStream Index: %d\n", stream->index);
@@ -1245,6 +1264,11 @@
/* Scale if necessary */
if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+ if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
+ fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+ exit(EXIT_FAILURE);
+ }
+#if CONFIG_LIBYUV
if (!stream->img)
stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
cfg->g_w, cfg->g_h, 16);
@@ -1260,8 +1284,15 @@
stream->img->stride[VPX_PLANE_V],
stream->img->d_w, stream->img->d_h,
kFilterBox);
-
img = stream->img;
+#else
+ stream->encoder.err = 1;
+ ctx_exit_on_error(&stream->encoder,
+ "Stream %d: Failed to encode frame.\n"
+ "Scaling disabled in this configuration. \n"
+ "To enable, configure with --enable-libyuv\n",
+ stream->index);
+#endif
}
vpx_usec_timer_start(&timer);
@@ -1501,7 +1532,6 @@
/* Setup default input stream settings */
input.framerate.numerator = 30;
input.framerate.denominator = 1;
- input.use_i420 = 1;
input.only_i420 = 1;
/* First parse the global configuration values, because we want to apply
@@ -1511,6 +1541,7 @@
argv = argv_dup(argc - 1, argv_ + 1);
parse_global_config(&global, argv);
+ input.fmt = global.use_i420 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_YV12;
{
/* Now parse each stream's parameters. Using a local scope here
@@ -1611,10 +1642,7 @@
frames.*/
memset(&raw, 0, sizeof(raw));
else
- vpx_img_alloc(&raw,
- input.use_i420 ? VPX_IMG_FMT_I420
- : VPX_IMG_FMT_YV12,
- input.width, input.height, 32);
+ vpx_img_alloc(&raw, input.fmt, input.width, input.height, 32);
FOREACH_STREAM(stream->rate_hist =
init_rate_histogram(&stream->config.cfg,