Merge "Remove unused vp8_strict_quantize"
diff --git a/README b/README
index f9c24ff..6f864d8 100644
--- a/README
+++ b/README
@@ -65,6 +65,7 @@
armv7-win32-vs12
armv7s-darwin-gcc
mips32-linux-gcc
+ mips64-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
ppc32-linux-gcc
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 3653309..7907225 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh
@@ -245,13 +245,13 @@
case "$target" in
x86_64*)
platforms[0]="x64"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "\$(InputPath)""
+ asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "\$(InputPath)""
;;
x86*)
platforms[0]="Win32"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "\$(InputPath)""
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "\$(InputPath)""
+ asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "\$(InputPath)""
;;
*) die "Unsupported target $target!"
;;
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 23ef6a3..56b9a3b 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -253,13 +253,13 @@
case "$target" in
x86_64*)
platforms[0]="x64"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)""
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)""
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} "%(FullPath)""
+ asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} "%(FullPath)""
;;
x86*)
platforms[0]="Win32"
- asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} "%(FullPath)""
- asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} "%(FullPath)""
+ asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} "%(FullPath)""
+ asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} "%(FullPath)""
;;
arm*)
asm_Debug_cmdline="armasm -nologo "%(FullPath)""
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 28ef69c..40bcb33 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -49,7 +49,7 @@
my %config = ();
while (<CONFIG_FILE>) {
- next if !/^CONFIG_/;
+ next if !/^(?:CONFIG_|HAVE_)/;
chomp;
my @pair = split /=/;
$config{$pair[0]} = $pair[1];
@@ -365,13 +365,13 @@
@REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
&require(@REQUIRES);
x86;
-} elsif ($opts{arch} eq 'mips32') {
- @ALL_ARCHS = filter(qw/mips32/);
+} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+ @ALL_ARCHS = filter("$opts{arch}");
open CONFIG_FILE, $opts{config} or
die "Error opening config file '$opts{config}': $!\n";
while (<CONFIG_FILE>) {
if (/HAVE_DSPR2=yes/) {
- @ALL_ARCHS = filter(qw/mips32 dspr2/);
+ @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
last;
}
}
diff --git a/configure b/configure
index 92ca061..7b9c211 100755
--- a/configure
+++ b/configure
@@ -111,6 +111,7 @@
all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
+all_platforms="${all_platforms} mips64-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc"
all_platforms="${all_platforms} ppc32-linux-gcc"
@@ -254,6 +255,8 @@
mips32
dspr2
+ mips64
+
mmx
sse
sse2
@@ -278,6 +281,7 @@
spatial_svc
vp9_temporal_denoising
fp_mb_stats
+ emulate_hardware_highbitdepth
"
CONFIG_LIST="
external_build
@@ -331,6 +335,7 @@
multi_res_encoding
temporal_denoising
coefficient_range_checking
+ vp9_highbitdepth
experimental
size_limit
${EXPERIMENT_LIST}
@@ -389,6 +394,7 @@
multi_res_encoding
temporal_denoising
coefficient_range_checking
+ vp9_highbitdepth
experimental
"
diff --git a/examples.mk b/examples.mk
index 91bd45a..fd67a44 100644
--- a/examples.mk
+++ b/examples.mk
@@ -31,6 +31,7 @@
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
+ third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc \
@@ -113,7 +114,7 @@
vpx_temporal_svc_encoder.SRCS += video_writer.h video_writer.c
vpx_temporal_svc_encoder.GUID = B18C08F2-A439-4502-A78E-849BE3D60947
vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
-EXAMPLES-$(CONFIG_VP8_DECODER) += simple_decoder.c
+EXAMPLES-$(CONFIG_DECODERS) += simple_decoder.c
simple_decoder.GUID = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
simple_decoder.SRCS += ivfdec.h ivfdec.c
simple_decoder.SRCS += tools_common.h tools_common.c
@@ -122,7 +123,7 @@
simple_decoder.SRCS += vpx_ports/mem_ops.h
simple_decoder.SRCS += vpx_ports/mem_ops_aligned.h
simple_decoder.DESCRIPTION = Simplified decoder loop
-EXAMPLES-$(CONFIG_VP8_DECODER) += postproc.c
+EXAMPLES-$(CONFIG_DECODERS) += postproc.c
postproc.SRCS += ivfdec.h ivfdec.c
postproc.SRCS += tools_common.h tools_common.c
postproc.SRCS += video_common.h
@@ -131,7 +132,7 @@
postproc.SRCS += vpx_ports/mem_ops_aligned.h
postproc.GUID = 65E33355-F35E-4088-884D-3FD4905881D7
postproc.DESCRIPTION = Decoder postprocessor control
-EXAMPLES-$(CONFIG_VP8_DECODER) += decode_to_md5.c
+EXAMPLES-$(CONFIG_DECODERS) += decode_to_md5.c
decode_to_md5.SRCS += md5_utils.h md5_utils.c
decode_to_md5.SRCS += ivfdec.h ivfdec.c
decode_to_md5.SRCS += tools_common.h tools_common.c
@@ -141,29 +142,34 @@
decode_to_md5.SRCS += vpx_ports/mem_ops_aligned.h
decode_to_md5.GUID = 59120B9B-2735-4BFE-B022-146CA340FE42
decode_to_md5.DESCRIPTION = Frame by frame MD5 checksum
-EXAMPLES-$(CONFIG_VP8_ENCODER) += simple_encoder.c
+EXAMPLES-$(CONFIG_ENCODERS) += simple_encoder.c
simple_encoder.SRCS += ivfenc.h ivfenc.c
simple_encoder.SRCS += tools_common.h tools_common.c
simple_encoder.SRCS += video_common.h
simple_encoder.SRCS += video_writer.h video_writer.c
simple_encoder.GUID = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
simple_encoder.DESCRIPTION = Simplified encoder loop
-EXAMPLES-$(CONFIG_VP8_ENCODER) += twopass_encoder.c
+EXAMPLES-$(CONFIG_VP9_ENCODER) += vp9_lossless_encoder.c
+vp9_lossless_encoder.SRCS += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS += tools_common.h tools_common.c
+vp9_lossless_encoder.SRCS += video_common.h
+vp9_lossless_encoder.SRCS += video_writer.h video_writer.c
+vp9_lossless_encoder.GUID = B63C7C88-5348-46DC-A5A6-CC151EF93366
+vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
+EXAMPLES-$(CONFIG_ENCODERS) += twopass_encoder.c
twopass_encoder.SRCS += ivfenc.h ivfenc.c
twopass_encoder.SRCS += tools_common.h tools_common.c
twopass_encoder.SRCS += video_common.h
twopass_encoder.SRCS += video_writer.h video_writer.c
twopass_encoder.GUID = 73494FA6-4AF9-4763-8FBB-265C92402FD8
twopass_encoder.DESCRIPTION = Two-pass encoder loop
-ifeq ($(CONFIG_DECODERS),yes)
-EXAMPLES-$(CONFIG_VP8_ENCODER) += decode_with_drops.c
+EXAMPLES-$(CONFIG_DECODERS) += decode_with_drops.c
decode_with_drops.SRCS += ivfdec.h ivfdec.c
decode_with_drops.SRCS += tools_common.h tools_common.c
decode_with_drops.SRCS += video_common.h
decode_with_drops.SRCS += video_reader.h video_reader.c
decode_with_drops.SRCS += vpx_ports/mem_ops.h
decode_with_drops.SRCS += vpx_ports/mem_ops_aligned.h
-endif
decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
decode_with_drops.DESCRIPTION = Drops frames while decoding
EXAMPLES-$(CONFIG_ENCODERS) += set_maps.c
@@ -185,7 +191,9 @@
ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
ifeq ($(CONFIG_LIBYUV),yes)
EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8_multi_resolution_encoder.c
+vp8_multi_resolution_encoder.SRCS += ivfenc.h ivfenc.c
vp8_multi_resolution_encoder.SRCS += tools_common.h tools_common.c
+vp8_multi_resolution_encoder.SRCS += video_writer.h video_writer.c
vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS)
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c
index 1c56303..fbc0f4a 100644
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -33,8 +33,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c
index a20fdac..9423e38 100644
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -56,8 +56,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
diff --git a/examples/postproc.c b/examples/postproc.c
index 59c50b1..c74347c 100644
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -43,8 +43,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
diff --git a/examples/set_maps.c b/examples/set_maps.c
index af8c582..851adc4 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -42,11 +42,11 @@
// Use the `simple_decoder` example to decode this sample, and observe
// the change in the image at frames 22, 33, and 44.
+#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
@@ -125,10 +125,11 @@
die_codec(codec, "Failed to set active map");
}
-static void encode_frame(vpx_codec_ctx_t *codec,
- vpx_image_t *img,
- int frame_index,
- VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+ vpx_image_t *img,
+ int frame_index,
+ VpxVideoWriter *writer) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -137,6 +138,8 @@
die_codec(codec, "Failed to encode frame");
while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
if (!vpx_video_writer_write_frame(writer,
@@ -150,6 +153,8 @@
fflush(stdout);
}
}
+
+ return got_pkts;
}
int main(int argc, char **argv) {
@@ -172,9 +177,10 @@
memset(&info, 0, sizeof(info));
encoder = get_vpx_encoder_by_name(argv[1]);
- if (!encoder)
+ if (encoder == NULL) {
die("Unsupported codec.");
-
+ }
+ assert(encoder != NULL);
info.codec_fourcc = encoder->fourcc;
info.frame_width = strtol(argv[2], NULL, 0);
info.frame_height = strtol(argv[3], NULL, 0);
@@ -217,6 +223,7 @@
if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
die_codec(&codec, "Failed to initialize encoder");
+ // Encode frames.
while (vpx_img_read(&raw, infile)) {
++frame_count;
@@ -230,7 +237,10 @@
encode_frame(&codec, &raw, frame_count, writer);
}
- encode_frame(&codec, NULL, -1, writer);
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, writer)) {}
+
printf("\n");
fclose(infile);
printf("Processed %d frames.\n", frame_count);
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c
index 3318758..3f7d6aa 100644
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -29,9 +29,7 @@
// -----------------
// For decoders, you only have to include `vpx_decoder.h` and then any
// header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
//
// Initializing The Codec
// ----------------------
@@ -81,8 +79,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 30bb73a..f20c246 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -28,9 +28,7 @@
// -----------------
// For encoders, you only have to include `vpx_encoder.h` and then any
// header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
//
// Getting The Default Configuration
// ---------------------------------
@@ -101,7 +99,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_encoder.h"
#include "./tools_common.h"
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index 369b1d8..653ae94 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -28,9 +28,8 @@
// Encoding A Frame
// ----------------
// Encoding a frame in two pass mode is identical to the simple encoder
-// example, except the deadline is set to VPX_DL_BEST_QUALITY to get the
-// best quality possible. VPX_DL_GOOD_QUALITY could also be used.
-//
+// example. To increase the quality while sacrificing encoding speed,
+// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY.
//
// Processing Statistics Packets
// -----------------------------
@@ -52,7 +51,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vpx_encoder.h"
#include "./tools_common.h"
@@ -66,13 +64,14 @@
exit(EXIT_FAILURE);
}
-static void get_frame_stats(vpx_codec_ctx_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned int duration,
- vpx_enc_frame_flags_t flags,
- unsigned int deadline,
- vpx_fixed_buf_t *stats) {
+static int get_frame_stats(vpx_codec_ctx_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned int duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned int deadline,
+ vpx_fixed_buf_t *stats) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -81,6 +80,8 @@
die_codec(ctx, "Failed to get frame stats.");
while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+
if (pkt->kind == VPX_CODEC_STATS_PKT) {
const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
const size_t pkt_size = pkt->data.twopass_stats.sz;
@@ -89,15 +90,18 @@
stats->sz += pkt_size;
}
}
+
+ return got_pkts;
}
-static void encode_frame(vpx_codec_ctx_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned int duration,
- vpx_enc_frame_flags_t flags,
- unsigned int deadline,
- VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned int duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned int deadline,
+ VpxVideoWriter *writer) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(ctx, img, pts, duration, flags,
@@ -106,6 +110,7 @@
die_codec(ctx, "Failed to encode frame.");
while ((pkt = vpx_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
@@ -117,19 +122,90 @@
fflush(stdout);
}
}
+
+ return got_pkts;
+}
+
+static vpx_fixed_buf_t pass0(vpx_image_t *raw,
+ FILE *infile,
+ const VpxInterface *encoder,
+ const vpx_codec_enc_cfg_t *cfg) {
+ vpx_codec_ctx_t codec;
+ int frame_count = 0;
+ vpx_fixed_buf_t stats = {NULL, 0};
+
+ if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Calculate frame statistics.
+ while (vpx_img_read(raw, infile)) {
+ ++frame_count;
+ get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
+ &stats);
+ }
+
+ // Flush encoder.
+ while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
+ VPX_DL_GOOD_QUALITY, &stats)) {}
+
+ printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec.");
+
+ return stats;
+}
+
+static void pass1(vpx_image_t *raw,
+ FILE *infile,
+ const char *outfile_name,
+ const VpxInterface *encoder,
+ const vpx_codec_enc_cfg_t *cfg) {
+ VpxVideoInfo info = {
+ encoder->fourcc,
+ cfg->g_w,
+ cfg->g_h,
+ {cfg->g_timebase.num, cfg->g_timebase.den}
+ };
+ VpxVideoWriter *writer = NULL;
+ vpx_codec_ctx_t codec;
+ int frame_count = 0;
+
+ writer = vpx_video_writer_open(outfile_name, kContainerIVF, &info);
+ if (!writer)
+ die("Failed to open %s for writing", outfile_name);
+
+ if (vpx_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Encode frames.
+ while (vpx_img_read(raw, infile)) {
+ ++frame_count;
+ encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {}
+
+ printf("\n");
+
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec.");
+
+ vpx_video_writer_close(writer);
+
+ printf("Pass 1 complete. Processed %d frames.\n", frame_count);
}
int main(int argc, char **argv) {
FILE *infile = NULL;
- VpxVideoWriter *writer = NULL;
+ int w, h;
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
vpx_image_t raw;
vpx_codec_err_t res;
- vpx_fixed_buf_t stats = {0};
- VpxVideoInfo info = {0};
+ vpx_fixed_buf_t stats;
+
const VpxInterface *encoder = NULL;
- int pass;
const int fps = 30; // TODO(dkovalev) add command line argument
const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument
const char *const codec_arg = argv[1];
@@ -146,85 +222,44 @@
if (!encoder)
die("Unsupported codec.");
- info.codec_fourcc = encoder->fourcc;
- info.time_base.numerator = 1;
- info.time_base.denominator = fps;
- info.frame_width = strtol(width_arg, NULL, 0);
- info.frame_height = strtol(height_arg, NULL, 0);
+ w = strtol(width_arg, NULL, 0);
+ h = strtol(height_arg, NULL, 0);
- if (info.frame_width <= 0 ||
- info.frame_height <= 0 ||
- (info.frame_width % 2) != 0 ||
- (info.frame_height % 2) != 0) {
- die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
- }
+ if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+ die("Invalid frame size: %dx%d", w, h);
- if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
- info.frame_height, 1)) {
- die("Failed to allocate image", info.frame_width, info.frame_height);
- }
-
- writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
- if (!writer)
- die("Failed to open %s for writing", outfile_arg);
+ if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
+ die("Failed to allocate image", w, h);
printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+ // Configuration
res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
if (res)
die_codec(&codec, "Failed to get default codec config.");
- cfg.g_w = info.frame_width;
- cfg.g_h = info.frame_height;
- cfg.g_timebase.num = info.time_base.numerator;
- cfg.g_timebase.den = info.time_base.denominator;
+ cfg.g_w = w;
+ cfg.g_h = h;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = fps;
cfg.rc_target_bitrate = bitrate;
- for (pass = 0; pass < 2; ++pass) {
- int frame_count = 0;
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading", infile_arg);
- if (pass == 0) {
- cfg.g_pass = VPX_RC_FIRST_PASS;
- } else {
- cfg.g_pass = VPX_RC_LAST_PASS;
- cfg.rc_twopass_stats_in = stats;
- }
+ // Pass 0
+ cfg.g_pass = VPX_RC_FIRST_PASS;
+ stats = pass0(&raw, infile, encoder, &cfg);
- if (!(infile = fopen(infile_arg, "rb")))
- die("Failed to open %s for reading", infile_arg);
-
- if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
- die_codec(&codec, "Failed to initialize encoder");
-
- while (vpx_img_read(&raw, infile)) {
- ++frame_count;
-
- if (pass == 0) {
- get_frame_stats(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- &stats);
- } else {
- encode_frame(&codec, &raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- writer);
- }
- }
-
- if (pass == 0) {
- get_frame_stats(&codec, NULL, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
- &stats);
- } else {
- printf("\n");
- }
-
- fclose(infile);
- printf("Pass %d complete. Processed %d frames.\n", pass + 1, frame_count);
- if (vpx_codec_destroy(&codec))
- die_codec(&codec, "Failed to destroy codec.");
- }
-
- vpx_img_free(&raw);
+ // Pass 1
+ rewind(infile);
+ cfg.g_pass = VPX_RC_LAST_PASS;
+ cfg.rc_twopass_stats_in = stats;
+ pass1(&raw, infile, outfile_arg, encoder, &cfg);
free(stats.buf);
- vpx_video_writer_close(writer);
+ vpx_img_free(&raw);
+ fclose(infile);
return EXIT_SUCCESS;
}
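
With the refactor above, the two passes no longer share one loop; main() now reads as the essential two-pass sequence: accumulate first-pass stats into a single vpx_fixed_buf_t, rewind the input, then hand that buffer to the final pass. Condensed, with identifiers as in the example:

    cfg.g_pass = VPX_RC_FIRST_PASS;
    stats = pass0(&raw, infile, encoder, &cfg);  // collects VPX_CODEC_STATS_PKT data

    rewind(infile);                        // re-read the same input frames
    cfg.g_pass = VPX_RC_LAST_PASS;
    cfg.rc_twopass_stats_in = stats;       // feed first-pass stats to pass 2
    pass1(&raw, infile, outfile_arg, encoder, &cfg);
    free(stats.buf);                       // buffer grown by get_frame_stats()
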
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c
index d41e442..9f50dc7 100644
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -8,446 +8,292 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-/*
- * This is an example demonstrating multi-resolution encoding in VP8.
- * High-resolution input video is down-sampled to lower-resolutions. The
- * encoder then encodes the video and outputs multiple bitstreams with
- * different resolutions.
- */
+
+// This is an example demonstrating multi-resolution encoding in VP8.
+// High-resolution input video is down-sampled to lower-resolutions. The
+// encoder then encodes the video and outputs multiple bitstreams with
+// different resolutions.
+//
+// Configure with the --enable-multi-res-encoding flag to enable this example.
+
#include <stdio.h>
#include <stdlib.h>
-#include <stdarg.h>
#include <string.h>
-#include <math.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
-#include "vpx_ports/mem_ops.h"
-#include "./tools_common.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc 0x30385056
-void usage_exit() {
- exit(EXIT_FAILURE);
-}
-
-/*
- * The input video frame is downsampled several times to generate a multi-level
- * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
- * levels required. For example, if the size of input video is 1280x720,
- * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
- * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
- * 320x180(level 2) respectively.
- */
-#define NUM_ENCODERS 3
-
-/* This example uses the scaler function in libyuv. */
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "third_party/libyuv/include/libyuv/scale.h"
#include "third_party/libyuv/include/libyuv/cpu_id.h"
-int (*read_frame_p)(FILE *f, vpx_image_t *img);
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
-static int read_frame(FILE *f, vpx_image_t *img) {
- size_t nbytes, to_read;
- int res = 1;
+#include "./tools_common.h"
+#include "./video_writer.h"
- to_read = img->w*img->h*3/2;
- nbytes = fread(img->planes[0], 1, to_read, f);
- if(nbytes != to_read) {
- res = 0;
- if(nbytes > 0)
- printf("Warning: Read partial frame. Check your width & height!\n");
- }
- return res;
+// The input video frame is downsampled several times to generate a
+// multi-level hierarchical structure. kNumEncoders is defined as the number
+// of encoding levels required. For example, if the size of the input video
+// is 1280x720, kNumEncoders is 3, and the down-sampling factor is 2, the
+// encoder outputs 3 bitstreams at resolutions of 1280x720 (level 0),
+// 640x360 (level 1), and 320x180 (level 2), respectively.
+#define kNumEncoders 3
+
+static const char *exec_name;
+
+void usage_exit() {
+ fprintf(stderr,
+ "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
}
-static int read_frame_by_row(FILE *f, vpx_image_t *img) {
- size_t nbytes, to_read;
- int res = 1;
- int plane;
+int main(int argc, char *argv[]) {
+ int frame_cnt = 0;
+ FILE *infile = NULL;
+ VpxVideoWriter *writers[kNumEncoders];
+ vpx_codec_ctx_t codec[kNumEncoders];
+ vpx_codec_enc_cfg_t cfg[kNumEncoders];
+ vpx_image_t raw[kNumEncoders];
+ const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8");
+ // Currently, only realtime mode is supported in multi-resolution encoding.
+ const int arg_deadline = VPX_DL_REALTIME;
+ int i;
+ int width = 0;
+ int height = 0;
+ int frame_avail = 0;
+ int got_data = 0;
- for (plane = 0; plane < 3; plane++)
+ // Set show_psnr to 1 to report PSNR, or to 0 to skip the PSNR
+ // calculation and save encoding time.
+ int show_psnr = 0;
+ uint64_t psnr_sse_total[kNumEncoders] = {0};
+ uint64_t psnr_samples_total[kNumEncoders] = {0};
+ double psnr_totals[kNumEncoders][4] = {{0, 0}};
+ int psnr_count[kNumEncoders] = {0};
+
+ // Set the required target bitrates for each resolution level.
+ // If the target bitrate for the highest-resolution level is set to 0
+ // (i.e. target_bitrate[0] = 0), encoding is skipped at that level.
+ unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100};
+
+ // Enter the frame rate of the input video.
+ const int framerate = 30;
+ // Set down-sampling factor for each resolution level.
+ // dsf[0] controls down sampling from level 0 to level 1;
+ // dsf[1] controls down sampling from level 1 to level 2;
+ // dsf[2] is not used.
+ vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}};
+
+ exec_name = argv[0];
+
+ if (!encoder)
+ die("Unsupported codec.");
+
+ // exe_name, input width, input height, input file,
+ // output file 1, output file 2, output file 3, psnr on/off
+ if (argc != (5 + kNumEncoders))
+ die("Invalid number of input options.");
+
+ printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+ width = strtol(argv[1], NULL, 0);
+ height = strtol(argv[2], NULL, 0);
+
+ if (width < 16 || width % 2 || height < 16 || height % 2)
+ die("Invalid resolution: %ldx%ld", width, height);
+
+ // Open input video file for encoding
+ if (!(infile = fopen(argv[3], "rb")))
+ die("Failed to open %s for reading", argv[3]);
+
+ show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0);
+
+ // Populate default encoder configuration
+ for (i = 0; i < kNumEncoders; ++i) {
+ vpx_codec_err_t res =
+ vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0);
+ if (res != VPX_CODEC_OK) {
+ printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
+ return EXIT_FAILURE;
+ }
+ }
+
+ // Update the default configuration according to needs of the application.
+ // Highest-resolution encoder settings
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].g_threads = 1;
+ cfg[0].rc_dropframe_thresh = 30;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 4;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 98;
+ cfg[0].rc_overshoot_pct = 100;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1;
+ cfg[0].g_lag_in_frames = 0;
+ cfg[0].kf_mode = VPX_KF_AUTO; // VPX_KF_DISABLED
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+ cfg[0].rc_target_bitrate = target_bitrate[0];
+ cfg[0].g_timebase.num = 1;
+ cfg[0].g_timebase.den = framerate;
+
+ // Other-resolution encoder settings
+ for (i = 1; i < kNumEncoders; ++i) {
+ cfg[i] = cfg[0];
+ cfg[i].g_threads = 1;
+ cfg[i].rc_target_bitrate = target_bitrate[i];
+
+ // Note: Width & height of other-resolution encoders are calculated
+ // from the highest-resolution encoder's size and the corresponding
+ // down_sampling_factor.
{
- unsigned char *ptr;
- int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
- int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
- int r;
+ unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1;
+ unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1;
+ cfg[i].g_w = iw / dsf[i - 1].num;
+ cfg[i].g_h = ih / dsf[i - 1].num;
+ }
- /* Determine the correct plane based on the image format. The for-loop
- * always counts in Y,U,V order, but this may not match the order of
- * the data on disk.
- */
- switch (plane)
- {
- case 1:
- ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
+ // Make width & height to be multiplier of 2.
+ if ((cfg[i].g_w) % 2)
+ cfg[i].g_w++;
+
+ if ((cfg[i].g_h) % 2)
+ cfg[i].g_h++;
+ }
+
+ // Open output file for each encoder to output bitstreams
+ for (i = 0; i < kNumEncoders; ++i) {
+ VpxVideoInfo info = {
+ encoder->fourcc,
+ cfg[i].g_w,
+ cfg[i].g_h,
+ {cfg[i].g_timebase.num, cfg[i].g_timebase.den}
+ };
+
+ if (!(writers[i] = vpx_video_writer_open(argv[i+4], kContainerIVF, &info)))
+ die("Failed to open %s for writing", argv[i+4]);
+ }
+
+ // Allocate image for each encoder
+ for (i = 0; i < kNumEncoders; ++i)
+ if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+ die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
+ // Initialize multi-encoder
+ if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0],
+ kNumEncoders,
+ show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0]))
+ die_codec(&codec[0], "Failed to initialize encoder");
+
+ // The extra encoding configuration parameters can be set as follows.
+ for (i = 0; i < kNumEncoders; i++) {
+ // Set encoding speed
+ if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6))
+ die_codec(&codec[i], "Failed to set cpu_used");
+
+ // Set static threshold.
+ if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+
+ // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING
+ // Enable denoising for the highest-resolution encoder.
+ if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, i == 0))
+ die_codec(&codec[0], "Failed to set noise_sensitivity");
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while (frame_avail || got_data) {
+ vpx_codec_iter_t iter[kNumEncoders] = {NULL};
+ const vpx_codec_cx_pkt_t *pkt[kNumEncoders];
+
+ frame_avail = vpx_img_read(&raw[0], infile);
+
+ if (frame_avail) {
+ for (i = 1; i < kNumEncoders; ++i) {
+ vpx_image_t *const prev = &raw[i - 1];
+
+ // Scale the image down a number of times by the down-sampling factor.
+ // FilterMode 1 or 2 gives better PSNR than FilterMode 0.
+ I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y],
+ prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U],
+ prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V],
+ prev->d_w, prev->d_h,
+ raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+ raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+ raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+ raw[i].d_w, raw[i].d_h, 1);
+ }
+ }
+
+ // Encode frame.
+ if (vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+ frame_cnt, 1, 0, arg_deadline)) {
+ die_codec(&codec[0], "Failed to encode frame");
+ }
+
+ for (i = kNumEncoders - 1; i >= 0; i--) {
+ got_data = 0;
+
+ while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) {
+ got_data = 1;
+ switch (pkt[i]->kind) {
+ case VPX_CODEC_CX_FRAME_PKT:
+ vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf,
+ pkt[i]->data.frame.sz, frame_cnt - 1);
+ break;
+ case VPX_CODEC_PSNR_PKT:
+ if (show_psnr) {
+ int j;
+ psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+ psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+ for (j = 0; j < 4; j++)
+ psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+ psnr_count[i]++;
+ }
break;
- case 2:
- ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
+ default:
break;
- default:
- ptr = img->planes[plane];
}
+ printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
+ (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
+ fflush(stdout);
+ }
+ }
+ frame_cnt++;
+ }
+ printf("\n");
- for (r = 0; r < h; r++)
- {
- to_read = w;
+ fclose(infile);
- nbytes = fread(ptr, 1, to_read, f);
- if(nbytes != to_read) {
- res = 0;
- if(nbytes > 0)
- printf("Warning: Read partial frame. Check your width & height!\n");
- break;
- }
+ printf("Processed %d frames.\n", frame_cnt - 1);
+ for (i = 0; i < kNumEncoders; ++i) {
+ // Calculate PSNR and print it out
+ if (show_psnr && psnr_count[i] > 0) {
+ int j;
+ double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+ psnr_sse_total[i]);
- ptr += img->stride[plane];
- }
- if (!res)
- break;
+ fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (j = 0; j < 4; j++)
+ fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
}
- return res;
-}
+ if (vpx_codec_destroy(&codec[i]))
+ die_codec(&codec[i], "Failed to destroy codec");
-static void write_ivf_file_header(FILE *outfile,
- const vpx_codec_enc_cfg_t *cfg,
- int frame_cnt) {
- char header[32];
+ vpx_img_free(&raw[i]);
+ vpx_video_writer_close(writers[i]);
+ }
+ printf("\n");
- if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
- return;
- header[0] = 'D';
- header[1] = 'K';
- header[2] = 'I';
- header[3] = 'F';
- mem_put_le16(header+4, 0); /* version */
- mem_put_le16(header+6, 32); /* headersize */
- mem_put_le32(header+8, fourcc); /* headersize */
- mem_put_le16(header+12, cfg->g_w); /* width */
- mem_put_le16(header+14, cfg->g_h); /* height */
- mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
- mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
- mem_put_le32(header+24, frame_cnt); /* length */
- mem_put_le32(header+28, 0); /* unused */
-
- (void) fwrite(header, 1, 32, outfile);
-}
-
-static void write_ivf_frame_header(FILE *outfile,
- const vpx_codec_cx_pkt_t *pkt)
-{
- char header[12];
- vpx_codec_pts_t pts;
-
- if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
- return;
-
- pts = pkt->data.frame.pts;
- mem_put_le32(header, pkt->data.frame.sz);
- mem_put_le32(header+4, pts&0xFFFFFFFF);
- mem_put_le32(header+8, pts >> 32);
-
- (void) fwrite(header, 1, 12, outfile);
-}
-
-int main(int argc, char **argv)
-{
- FILE *infile, *outfile[NUM_ENCODERS];
- vpx_codec_ctx_t codec[NUM_ENCODERS];
- vpx_codec_enc_cfg_t cfg[NUM_ENCODERS];
- vpx_codec_pts_t frame_cnt = 0;
- vpx_image_t raw[NUM_ENCODERS];
- vpx_codec_err_t res[NUM_ENCODERS];
-
- int i;
- long width;
- long height;
- int frame_avail;
- int got_data;
- int flags = 0;
-
- /*Currently, only realtime mode is supported in multi-resolution encoding.*/
- int arg_deadline = VPX_DL_REALTIME;
-
- /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
- don't need to know PSNR, which will skip PSNR calculation and save
- encoding time. */
- int show_psnr = 0;
- uint64_t psnr_sse_total[NUM_ENCODERS] = {0};
- uint64_t psnr_samples_total[NUM_ENCODERS] = {0};
- double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
- int psnr_count[NUM_ENCODERS] = {0};
-
- /* Set the required target bitrates for each resolution level.
- * If target bitrate for highest-resolution level is set to 0,
- * (i.e. target_bitrate[0]=0), we skip encoding at that level.
- */
- unsigned int target_bitrate[NUM_ENCODERS]={1000, 500, 100};
- /* Enter the frame rate of the input video */
- int framerate = 30;
- /* Set down-sampling factor for each resolution level.
- dsf[0] controls down sampling from level 0 to level 1;
- dsf[1] controls down sampling from level 1 to level 2;
- dsf[2] is not used. */
- vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
-
- if(argc!= (5+NUM_ENCODERS))
- die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
- argv[0]);
-
- printf("Using %s\n",vpx_codec_iface_name(interface));
-
- width = strtol(argv[1], NULL, 0);
- height = strtol(argv[2], NULL, 0);
-
- if(width < 16 || width%2 || height <16 || height%2)
- die("Invalid resolution: %ldx%ld", width, height);
-
- /* Open input video file for encoding */
- if(!(infile = fopen(argv[3], "rb")))
- die("Failed to open %s for reading", argv[3]);
-
- /* Open output file for each encoder to output bitstreams */
- for (i=0; i< NUM_ENCODERS; i++)
- {
- if(!target_bitrate[i])
- {
- outfile[i] = NULL;
- continue;
- }
-
- if(!(outfile[i] = fopen(argv[i+4], "wb")))
- die("Failed to open %s for writing", argv[i+4]);
- }
-
- show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0);
-
- /* Populate default encoder configuration */
- for (i=0; i< NUM_ENCODERS; i++)
- {
- res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
- if(res[i]) {
- printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
- return EXIT_FAILURE;
- }
- }
-
- /*
- * Update the default configuration according to needs of the application.
- */
- /* Highest-resolution encoder settings */
- cfg[0].g_w = width;
- cfg[0].g_h = height;
- cfg[0].g_threads = 1; /* number of threads used */
- cfg[0].rc_dropframe_thresh = 30;
- cfg[0].rc_end_usage = VPX_CBR;
- cfg[0].rc_resize_allowed = 0;
- cfg[0].rc_min_quantizer = 4;
- cfg[0].rc_max_quantizer = 56;
- cfg[0].rc_undershoot_pct = 98;
- cfg[0].rc_overshoot_pct = 100;
- cfg[0].rc_buf_initial_sz = 500;
- cfg[0].rc_buf_optimal_sz = 600;
- cfg[0].rc_buf_sz = 1000;
- cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
- cfg[0].g_lag_in_frames = 0;
-
- /* Disable automatic keyframe placement */
- /* Note: These 3 settings are copied to all levels. But, except the lowest
- * resolution level, all other levels are set to VPX_KF_DISABLED internally.
- */
- //cfg[0].kf_mode = VPX_KF_DISABLED;
- cfg[0].kf_mode = VPX_KF_AUTO;
- cfg[0].kf_min_dist = 3000;
- cfg[0].kf_max_dist = 3000;
-
- cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */
- cfg[0].g_timebase.num = 1; /* Set fps */
- cfg[0].g_timebase.den = framerate;
-
- /* Other-resolution encoder settings */
- for (i=1; i< NUM_ENCODERS; i++)
- {
- memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
-
- cfg[i].g_threads = 1; /* number of threads used */
- cfg[i].rc_target_bitrate = target_bitrate[i];
-
- /* Note: Width & height of other-resolution encoders are calculated
- * from the highest-resolution encoder's size and the corresponding
- * down_sampling_factor.
- */
- {
- unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
- unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
- cfg[i].g_w = iw/dsf[i-1].num;
- cfg[i].g_h = ih/dsf[i-1].num;
- }
-
- /* Make width & height to be multiplier of 2. */
- // Should support odd size ???
- if((cfg[i].g_w)%2)cfg[i].g_w++;
- if((cfg[i].g_h)%2)cfg[i].g_h++;
- }
-
- /* Allocate image for each encoder */
- for (i=0; i< NUM_ENCODERS; i++)
- if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
- die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
-
- if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
- read_frame_p = read_frame;
- else
- read_frame_p = read_frame_by_row;
-
- for (i=0; i< NUM_ENCODERS; i++)
- if(outfile[i])
- write_ivf_file_header(outfile[i], &cfg[i], 0);
-
- /* Initialize multi-encoder */
- if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
- (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
- die_codec(&codec[0], "Failed to initialize encoder");
-
- /* The extra encoding configuration parameters can be set as follows. */
- /* Set encoding speed */
- for ( i=0; i<NUM_ENCODERS; i++)
- {
- int speed = -6;
- if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
- die_codec(&codec[i], "Failed to set cpu_used");
- }
-
- /* Set static threshold. */
- for ( i=0; i<NUM_ENCODERS; i++)
- {
- unsigned int static_thresh = 1;
- if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
- die_codec(&codec[i], "Failed to set static threshold");
- }
-
- /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
- /* Enable denoising for the highest-resolution encoder. */
- if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
- die_codec(&codec[0], "Failed to set noise_sensitivity");
- for ( i=1; i< NUM_ENCODERS; i++)
- {
- if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
- die_codec(&codec[i], "Failed to set noise_sensitivity");
- }
-
-
- frame_avail = 1;
- got_data = 0;
-
- while(frame_avail || got_data)
- {
- vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
- const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
-
- flags = 0;
- frame_avail = read_frame_p(infile, &raw[0]);
-
- if(frame_avail)
- {
- for ( i=1; i<NUM_ENCODERS; i++)
- {
- /*Scale the image down a number of times by downsampling factor*/
- /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
- I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
- raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
- raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
- raw[i-1].d_w, raw[i-1].d_h,
- raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
- raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
- raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
- raw[i].d_w, raw[i].d_h, 1);
- }
- }
-
- /* Encode each frame at multi-levels */
- if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
- frame_cnt, 1, flags, arg_deadline))
- die_codec(&codec[0], "Failed to encode frame");
-
- for (i=NUM_ENCODERS-1; i>=0 ; i--)
- {
- got_data = 0;
-
- while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
- {
- got_data = 1;
- switch(pkt[i]->kind) {
- case VPX_CODEC_CX_FRAME_PKT:
- write_ivf_frame_header(outfile[i], pkt[i]);
- (void) fwrite(pkt[i]->data.frame.buf, 1,
- pkt[i]->data.frame.sz, outfile[i]);
- break;
- case VPX_CODEC_PSNR_PKT:
- if (show_psnr)
- {
- int j;
-
- psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
- psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
- for (j = 0; j < 4; j++)
- {
- //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]);
- psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
- }
- psnr_count[i]++;
- }
-
- break;
- default:
- break;
- }
- printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
- && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
- fflush(stdout);
- }
- }
- frame_cnt++;
- }
- printf("\n");
-
- fclose(infile);
-
- printf("Processed %ld frames.\n",(long int)frame_cnt-1);
- for (i=0; i< NUM_ENCODERS; i++)
- {
- /* Calculate PSNR and print it out */
- if ( (show_psnr) && (psnr_count[i]>0) )
- {
- int j;
- double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
- psnr_sse_total[i]);
-
- fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
-
- fprintf(stderr, " %.3lf", ovpsnr);
- for (j = 0; j < 4; j++)
- {
- fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
- }
- }
-
- if(vpx_codec_destroy(&codec[i]))
- die_codec(&codec[i], "Failed to destroy codec");
-
- vpx_img_free(&raw[i]);
-
- if(!outfile[i])
- continue;
-
- /* Try to rewrite the file header with the actual frame count */
- if(!fseek(outfile[i], 0, SEEK_SET))
- write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
- fclose(outfile[i]);
- }
- printf("\n");
-
- return EXIT_SUCCESS;
+ return EXIT_SUCCESS;
}
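
The per-level size computation in the rewritten example is a ceiling division by the down-sampling factor, followed by rounding odd results up to an even dimension. With the default dsf of {2, 1} and a 1280x720 input, level 1 works out to 640x360. A hypothetical helper (scaled_dim is not part of the example) capturing the same arithmetic:

    // iw = 1280 * 1 + 2 - 1 = 1281;  g_w = 1281 / 2 = 640
    // ih =  720 * 1 + 2 - 1 =  721;  g_h =  721 / 2 = 360
    static unsigned int scaled_dim(unsigned int prev, vpx_rational_t dsf) {
      const unsigned int d = (prev * dsf.den + dsf.num - 1) / dsf.num;  // ceil
      return d % 2 ? d + 1 : d;  // bump odd dimensions to the next even value
    }
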
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c
index 46a40ca..b0961a2 100644
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -50,7 +50,6 @@
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
@@ -65,10 +64,11 @@
exit(EXIT_FAILURE);
}
-static void encode_frame(vpx_codec_ctx_t *codec,
- vpx_image_t *img,
- int frame_index,
- VpxVideoWriter *writer) {
+static int encode_frame(vpx_codec_ctx_t *codec,
+ vpx_image_t *img,
+ int frame_index,
+ VpxVideoWriter *writer) {
+ int got_pkts = 0;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt = NULL;
const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, 0,
@@ -77,6 +77,8 @@
die_codec(codec, "Failed to encode frame");
while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
if (!vpx_video_writer_write_frame(writer,
@@ -90,6 +92,8 @@
fflush(stdout);
}
}
+
+ return got_pkts;
}
int main(int argc, char **argv) {
@@ -160,6 +164,7 @@
if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
die_codec(&codec, "Failed to initialize encoder");
+ // Encode frames.
while (vpx_img_read(&raw, infile)) {
if (frame_count + 1 == update_frame_num) {
vpx_ref_frame_t ref;
@@ -171,7 +176,9 @@
encode_frame(&codec, &raw, frame_count++, writer);
}
- encode_frame(&codec, NULL, -1, writer);
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, writer)) {};
printf("\n");
fclose(infile);
diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c
new file mode 100644
index 0000000..3fcda0c
--- /dev/null
+++ b/examples/vp9_lossless_encoder.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+#include "./tools_common.h"
+#include "./video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+ fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
+ "encoding feature. Supports raw input only.\n");
+ fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static int encode_frame(vpx_codec_ctx_t *codec,
+ vpx_image_t *img,
+ int frame_index,
+ int flags,
+ VpxVideoWriter *writer) {
+ int got_pkts = 0;
+ vpx_codec_iter_t iter = NULL;
+ const vpx_codec_cx_pkt_t *pkt = NULL;
+ const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1,
+ flags, VPX_DL_GOOD_QUALITY);
+ if (res != VPX_CODEC_OK)
+ die_codec(codec, "Failed to encode frame");
+
+ while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+ if (!vpx_video_writer_write_frame(writer,
+ pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts)) {
+ die_codec(codec, "Failed to write compressed frame");
+ }
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ vpx_codec_ctx_t codec;
+ vpx_codec_enc_cfg_t cfg;
+ int frame_count = 0;
+ vpx_image_t raw;
+ vpx_codec_err_t res;
+ VpxVideoInfo info = {0};
+ VpxVideoWriter *writer = NULL;
+ const VpxInterface *encoder = NULL;
+ const int fps = 30;
+
+ exec_name = argv[0];
+
+ if (argc < 5)
+ die("Invalid number of arguments");
+
+ encoder = get_vpx_encoder_by_name("vp9");
+ if (!encoder)
+ die("Unsupported codec.");
+
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = strtol(argv[1], NULL, 0);
+ info.frame_height = strtol(argv[2], NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 ||
+ info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 ||
+ (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image.");
+ }
+
+ printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+ res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res)
+ die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+
+ writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+ if (!writer)
+ die("Failed to open %s for writing.", argv[4]);
+
+ if (!(infile = fopen(argv[3], "rb")))
+ die("Failed to open %s for reading.", argv[3]);
+
+ if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
+ die_codec(&codec, "Failed to use lossless mode");
+
+ // Encode frames.
+ while (vpx_img_read(&raw, infile)) {
+ encode_frame(&codec, &raw, frame_count++, 0, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+ printf("\n");
+ fclose(infile);
+ printf("Processed %d frames.\n", frame_count);
+
+ vpx_img_free(&raw);
+ if (vpx_codec_destroy(&codec))
+ die_codec(&codec, "Failed to destroy codec.");
+
+ vpx_video_writer_close(writer);
+
+ return EXIT_SUCCESS;
+}
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index 223f37e..7ce3403 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -38,8 +38,10 @@
ARG_DEF("t", "timebase", 1, "timebase (num/den)");
static const arg_def_t bitrate_arg = ARG_DEF(
"b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
-static const arg_def_t layers_arg =
- ARG_DEF("l", "layers", 1, "number of SVC layers");
+static const arg_def_t spatial_layers_arg =
+ ARG_DEF("sl", "spatial-layers", 1, "number of spatial SVC layers");
+static const arg_def_t temporal_layers_arg =
+ ARG_DEF("tl", "temporal-layers", 1, "number of temporal SVC layers");
static const arg_def_t kf_dist_arg =
ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
static const arg_def_t scale_factors_arg =
@@ -65,10 +67,11 @@
static const arg_def_t *svc_args[] = {
&frames_arg, &width_arg, &height_arg,
- &timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg,
+ &timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg,
&kf_dist_arg, &scale_factors_arg, &quantizers_arg, &passes_arg,
&pass_arg, &fpf_name_arg, &min_q_arg, &max_q_arg,
- &min_bitrate_arg, &max_bitrate_arg, NULL
+ &min_bitrate_arg, &max_bitrate_arg, &temporal_layers_arg,
+ NULL
};
static const uint32_t default_frames_to_skip = 0;
@@ -79,6 +82,7 @@
static const uint32_t default_timebase_den = 60;
static const uint32_t default_bitrate = 1000;
static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_temporal_layers = 1;
static const uint32_t default_kf_dist = 100;
typedef struct {
@@ -119,6 +123,7 @@
// initialize SvcContext with parameters that will be passed to vpx_svc_init
svc_ctx->log_level = SVC_LOG_DEBUG;
svc_ctx->spatial_layers = default_spatial_layers;
+ svc_ctx->temporal_layers = default_temporal_layers;
// start with default encoder configuration
res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
@@ -156,8 +161,10 @@
enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &skip_frames_arg, argi)) {
app_input->frames_to_skip = arg_parse_uint(&arg);
- } else if (arg_match(&arg, &layers_arg, argi)) {
+ } else if (arg_match(&arg, &spatial_layers_arg, argi)) {
svc_ctx->spatial_layers = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &temporal_layers_arg, argi)) {
+ svc_ctx->temporal_layers = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_dist_arg, argi)) {
enc_cfg->kf_min_dist = arg_parse_uint(&arg);
enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
@@ -275,7 +282,7 @@
int frame_duration = 1; /* 1 timebase tick per frame */
FILE *infile = NULL;
int end_of_stream = 0;
- int frame_size;
+ int frames_received = 0;
memset(&svc_ctx, 0, sizeof(svc_ctx));
svc_ctx.log_print = 1;
@@ -318,6 +325,8 @@
// Encode frames
while (!end_of_stream) {
+ vpx_codec_iter_t iter = NULL;
+ const vpx_codec_cx_pkt_t *cx_pkt;
if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
// We need one extra vpx_svc_encode call at end of stream to flush
// encoder and get remaining data
@@ -330,18 +339,34 @@
if (res != VPX_CODEC_OK) {
die_codec(&codec, "Failed to encode frame");
}
- if (!(app_input.passes == 2 && app_input.pass == 1)) {
- while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
- vpx_video_writer_write_frame(writer,
- vpx_svc_get_buffer(&svc_ctx),
- frame_size, pts);
+
+ while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+ switch (cx_pkt->kind) {
+ case VPX_CODEC_CX_FRAME_PKT: {
+ if (cx_pkt->data.frame.sz > 0)
+ vpx_video_writer_write_frame(writer,
+ cx_pkt->data.frame.buf,
+ cx_pkt->data.frame.sz,
+ cx_pkt->data.frame.pts);
+
+ printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
+ !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
+ (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+ ++frames_received;
+ break;
+ }
+ case VPX_CODEC_STATS_PKT: {
+ stats_write(&app_input.rc_stats,
+ cx_pkt->data.twopass_stats.buf,
+ cx_pkt->data.twopass_stats.sz);
+ break;
+ }
+ default: {
+ break;
+ }
}
}
- if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
- stats_write(&app_input.rc_stats,
- vpx_svc_get_rc_stats_buffer(&svc_ctx),
- vpx_svc_get_rc_stats_buffer_size(&svc_ctx));
- }
+
if (!end_of_stream) {
++frame_cnt;
pts += frame_duration;
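
The spatial-SVC example now consumes encoder output through the generic packet interface rather than polling the SVC-specific buffer getters, so compressed frames and first-pass stats arrive through one vpx_codec_get_cx_data() loop dispatched on the packet kind, and the pass-dependent branching disappears. Skeleton of that loop, with output handling elided:

    vpx_codec_iter_t iter = NULL;
    const vpx_codec_cx_pkt_t *cx_pkt;
    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
      switch (cx_pkt->kind) {
        case VPX_CODEC_CX_FRAME_PKT:  // compressed frame -> IVF writer
          break;
        case VPX_CODEC_STATS_PKT:     // first-pass stats -> stats buffer
          break;
        default:
          break;
      }
    }
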
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index be3e7b2..3de983e 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -12,12 +12,12 @@
// encoding scheme based on temporal scalability for video applications
// that benefit from a scalable bitstream.
+#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "./vpx_config.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx/vp8cx.h"
@@ -37,7 +37,8 @@
kDenoiserOff,
kDenoiserOnYOnly,
kDenoiserOnYUV,
- kDenoiserOnYUVAggressive // Aggressive mode not implemented currently.
+ kDenoiserOnYUVAggressive,
+ kDenoiserOnAdaptive
};
static int mode_to_num_layers[12] = {1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3};
@@ -437,7 +438,7 @@
}
int main(int argc, char **argv) {
- VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS];
+ VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = {NULL};
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
int frame_cnt = 0;
@@ -455,7 +456,6 @@
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity = 1;
- int max_intra_size_pct;
vpx_svc_layer_id_t layer_id = {0, 0};
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
@@ -569,6 +569,8 @@
outfile[i] = vpx_video_writer_open(file_name, kContainerIVF, &info);
if (!outfile[i])
die("Failed to open %s for writing", file_name);
+
+ assert(outfile[i] != NULL);
}
// No spatial layers in this encoder.
cfg.ss_number_layers = 1;
@@ -594,11 +596,11 @@
// This controls the maximum target size of the key frame.
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
- max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
- * ((double) cfg.g_timebase.den / cfg.g_timebase.num) / 10.0);
- // For low-quality key frame.
- max_intra_size_pct = 200;
- vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+ {
+ const int max_intra_size_pct = 200;
+ vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ max_intra_size_pct);
+ }
frame_avail = 1;
while (frame_avail || got_data) {
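
The block replaced above contained a dead store: max_intra_size_pct was computed from the buffer and timebase parameters and then immediately overwritten with 200, so only the constant survives, now scoped to its own block. For context, VP8E_SET_MAX_INTRA_BITRATE_PCT caps keyframe size as a percentage of the average per-frame bitrate; a sketch of what the value 200 implies (avg_frame_bits and max_keyframe_bits are illustrative names, not part of the example):

    // Average bits per frame at the configured rate (rc_target_bitrate is
    // in kbit/s; the timebase in these examples is 1/framerate).
    const int avg_frame_bits =
        cfg.rc_target_bitrate * 1000 * cfg.g_timebase.num / cfg.g_timebase.den;
    const int max_keyframe_bits = avg_frame_bits * 200 / 100;  // ~2x average
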
diff --git a/libs.mk b/libs.mk
index 25fbc2c..f9f2d80 100644
--- a/libs.mk
+++ b/libs.mk
@@ -133,6 +133,8 @@
CODEC_DOC_SECTIONS += vp9 vp9_decoder
endif
+VP9_PREFIX=vp9/
+$(BUILD_PFX)$(VP9_PREFIX)%.c.o: CFLAGS += -Wextra
ifeq ($(CONFIG_ENCODERS),yes)
CODEC_DOC_SECTIONS += encoder
@@ -529,7 +531,6 @@
@echo " [CREATE] $@"
@rm -f $@
@echo "INPUT += $^" >> $@
- @echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@
@echo "INCLUDE_PATH += ." >> $@;
@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index a9bb540..0221995 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -38,7 +38,7 @@
if (video->frame() == 1) {
encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
} else if (video->frame() == 3) {
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = vpx_active_map_t();
uint8_t active_map[9 * 13] = {
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
@@ -57,7 +57,7 @@
map.active_map = active_map;
encoder->Control(VP8E_SET_ACTIVEMAP, &map);
} else if (video->frame() == 15) {
- vpx_active_map_t map = {0};
+ vpx_active_map_t map = vpx_active_map_t();
map.cols = (kWidth + 15) / 16;
map.rows = (kHeight + 15) / 16;
map.active_map = NULL;
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 5b4a20e..d1e7f1d 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -581,6 +581,8 @@
using std::tr1::make_tuple;
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
const ConvolveFunctions convolve8_c(
vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
@@ -600,8 +602,11 @@
make_tuple(64, 32, &convolve8_c),
make_tuple(32, 64, &convolve8_c),
make_tuple(64, 64, &convolve8_c)));
+#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
const ConvolveFunctions convolve8_sse2(
vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
@@ -622,6 +627,7 @@
make_tuple(32, 64, &convolve8_sse2),
make_tuple(64, 64, &convolve8_sse2)));
#endif
+#endif
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
@@ -645,27 +651,7 @@
make_tuple(64, 64, &convolve8_ssse3)));
#endif
-#if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-}
-
+#if HAVE_AVX2 && HAVE_SSSE3
const ConvolveFunctions convolve8_avx2(
vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -676,9 +662,7 @@
make_tuple(8, 4, &convolve8_avx2),
make_tuple(4, 8, &convolve8_avx2),
make_tuple(8, 8, &convolve8_avx2),
- make_tuple(8, 16, &convolve8_avx2)));
-
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, ConvolveTest, ::testing::Values(
+ make_tuple(8, 16, &convolve8_avx2),
make_tuple(16, 8, &convolve8_avx2),
make_tuple(16, 16, &convolve8_avx2),
make_tuple(32, 16, &convolve8_avx2),
@@ -687,7 +671,7 @@
make_tuple(64, 32, &convolve8_avx2),
make_tuple(32, 64, &convolve8_avx2),
make_tuple(64, 64, &convolve8_avx2)));
-#endif
+#endif // HAVE_AVX2 && HAVE_SSSE3
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 7da7b80..4e8019a 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <cmath>
+#include <map>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
@@ -24,6 +25,28 @@
class CQTest : public ::libvpx_test::EncoderTest,
public ::libvpx_test::CodecTestWithParam<int> {
+ public:
+  // Maps the cq_level to the bitrate produced.
+ typedef std::map<int, uint32_t> BitrateMap;
+
+ static void SetUpTestCase() {
+ bitrates_.clear();
+ }
+
+ static void TearDownTestCase() {
+ ASSERT_TRUE(!HasFailure())
+ << "skipping bitrate validation due to earlier failure.";
+ uint32_t prev_actual_bitrate = kCQTargetBitrate;
+ for (BitrateMap::const_iterator iter = bitrates_.begin();
+ iter != bitrates_.end(); ++iter) {
+ const uint32_t cq_actual_bitrate = iter->second;
+ EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate)
+ << "cq_level: " << iter->first
+        << ", bitrate should decrease as CQ level increases.";
+ prev_actual_bitrate = cq_actual_bitrate;
+ }
+ }
+
protected:
CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) {
init_flags_ = VPX_CODEC_USE_PSNR;
@@ -66,9 +89,12 @@
return pow(10.0, avg_psnr / 10.0) / file_size_;
}
+ int cq_level() const { return cq_level_; }
size_t file_size() const { return file_size_; }
int n_frames() const { return n_frames_; }
+ static BitrateMap bitrates_;
+
private:
int cq_level_;
size_t file_size_;
@@ -76,7 +102,8 @@
int n_frames_;
};
-unsigned int prev_actual_bitrate = kCQTargetBitrate;
+CQTest::BitrateMap CQTest::bitrates_;
+
TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) {
const vpx_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
@@ -91,8 +118,7 @@
const unsigned int cq_actual_bitrate =
static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000);
EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate);
- EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate);
- prev_actual_bitrate = cq_actual_bitrate;
+ bitrates_[cq_level()] = cq_actual_bitrate;
// try targeting the approximate same bitrate with VBR mode
cfg_.rc_end_usage = VPX_VBR;
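The rewrite above removes a file-scope prev_actual_bitrate, whose correctness depended on the order in which test instances ran, in favor of a static map keyed by cq_level that TearDownTestCase validates once after the whole suite. A minimal sketch of that gtest pattern, with hypothetical names (FooTest, results_):

#include <map>
#include <stdint.h>
#include "third_party/googletest/src/include/gtest/gtest.h"

class FooTest : public ::testing::TestWithParam<int> {
 public:
  static void SetUpTestCase() { results_.clear(); }
  static void TearDownTestCase() {
    // std::map iterates in key order, so this checks monotonicity across
    // all parameter values regardless of test execution order.
    uint32_t prev = 0xffffffff;
    for (std::map<int, uint32_t>::const_iterator it = results_.begin();
         it != results_.end(); ++it) {
      EXPECT_LE(it->second, prev);
      prev = it->second;
    }
  }
 protected:
  static std::map<int, uint32_t> results_;  // filled by each TEST_P body
};
std::map<int, uint32_t> FooTest::results_;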
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 8dcf26c..a3d730a 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -42,6 +42,9 @@
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_);
+ }
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
duration_ = 0;
@@ -120,9 +123,40 @@
double file_datarate_;
double effective_datarate_;
size_t bits_in_last_frame_;
+ int denoiser_on_;
};
+// Check basic datarate targeting, for a single bitrate, but loop over the
+// various denoiser settings.
+TEST_P(DatarateTestLarge, DenoiserLevels) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 140);
+ for (int j = 1; j < 5; ++j) {
+ // Run over the denoiser levels.
+    // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING), the level j
+    // refers to the 4 denoiser modes: denoiserOnYOnly, denoiserOnYUV,
+    // denoiserOnYUVAggressive, and denoiserOnAdaptive.
+ // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j
+    // refers to the blur thresholds: 20, 40, 60, 80.
+ // The j = 0 case (denoiser off) is covered in the tests below.
+ denoiser_on_ = j;
+ cfg_.rc_target_bitrate = 300;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
+ << " The datarate for the file exceeds the target!";
+
+ ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
+ << " The datarate for the file missed the target!";
+ }
+}
+
TEST_P(DatarateTestLarge, BasicBufferModel) {
+ denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_dropframe_thresh = 1;
cfg_.rc_max_quantizer = 56;
@@ -154,6 +188,7 @@
}
TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+ denoiser_on_ = 0;
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_max_quantizer = 36;
cfg_.rc_end_usage = VPX_CBR;
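The new denoiser_on_ knob reaches the codec through VP8E_SET_NOISE_SENSITIVITY in PreEncodeFrameHook above. Outside the harness the same control is a single call on a live encoder context (a sketch; die_codec is the error helper the libvpx examples use):

// Sketch: select a VP8 denoiser level on an initialized vpx_codec_ctx_t.
// 0 disables denoising; on temporal-denoising builds, 1..4 select the
// modes the comment above lists (Y-only, YUV, YUV aggressive, adaptive).
const int denoiser_level = 2;  // denoiserOnYUV
if (vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, denoiser_level))
  die_codec(&codec, "Failed to set noise sensitivity");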
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ee417ce..d1ce109 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,12 +20,9 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
-extern "C" {
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
using libvpx_test::ACMRandom;
namespace {
@@ -258,40 +255,72 @@
}
}
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type);
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+ int /*tx_type*/) {
vp9_fdct16x16_c(in, out, stride);
}
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+ int /*tx_type*/) {
vp9_idct16x16_256_add_c(in, dest, stride);
}
-void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+ int tx_type) {
vp9_fht16x16_c(in, out, stride, tx_type);
}
-void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+ int tx_type) {
vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct16x16_256_add_c(in, out, stride, 10);
+}
+
+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct16x16_256_add_c(in, out, stride, 12);
+}
+
+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type) {
+ idct16x16_10(in, out, stride);
+}
+
+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type) {
+ idct16x16_12(in, out, stride);
+}
+
+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
class Trans16x16TestBase {
public:
virtual ~Trans16x16TestBase() {}
protected:
- virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+ virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
- virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+ virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
void RunAccuracyCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -300,23 +329,48 @@
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
+ }
}
ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
test_temp_block, pitch_));
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const uint32_t diff = dst[j] - src[j];
+#endif
const uint32_t error = diff * diff;
if (max_error < error)
max_error = error;
@@ -324,10 +378,10 @@
}
}
- EXPECT_GE(1u, max_error)
+ EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
<< "Error: 16x16 FHT/IHT has an individual round trip error > 1";
- EXPECT_GE(count_test_block , total_error)
+ EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
<< "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
}
@@ -335,13 +389,13 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j)
- input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -357,21 +411,21 @@
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- input_block[j] = rnd.Rand8() - rnd.Rand8();
- input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
}
if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = 255;
+ input_extreme_block[j] = mask_;
} else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = -255;
+ input_extreme_block[j] = -mask_;
}
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -381,7 +435,7 @@
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j) {
EXPECT_EQ(output_block[j], output_ref_block[j]);
- EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+ EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
<< "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
}
}
@@ -392,39 +446,65 @@
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+#endif
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- input_block[j] = rnd.Rand8() - rnd.Rand8();
- input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ if (bit_depth_ == VPX_BITS_8)
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+ else
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
}
if (i == 0)
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = 255;
+ input_extreme_block[j] = mask_;
if (i == 1)
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = -255;
+ input_extreme_block[j] = -mask_;
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
// clear reconstructed pixel buffers
vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+ vpx_memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+#endif
// quantization with maximum allowed step sizes
output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
for (int j = 1; j < kNumCoeffs; ++j)
output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
- inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
-
- for (int j = 0; j < kNumCoeffs; ++j)
- EXPECT_EQ(ref[j], dst[j]);
+ if (bit_depth_ == VPX_BITS_8) {
+ inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+ tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,
+ CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
+ if (bit_depth_ == VPX_BITS_8) {
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(ref[j], dst[j]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(ref16[j], dst16[j]);
+#endif
+ }
}
}
@@ -432,28 +512,52 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
for (int i = 0; i < count_test_block; ++i) {
double out_r[kNumCoeffs];
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- in[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+#endif
+ }
}
reference_16x16_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+ 16));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const uint32_t diff = dst[j] - src[j];
+#endif
const uint32_t error = diff * diff;
EXPECT_GE(1u, error)
<< "Error: 16x16 IDCT has error " << error
@@ -463,6 +567,8 @@
}
int pitch_;
int tx_type_;
+ vpx_bit_depth_t bit_depth_;
+ int mask_;
FhtFunc fwd_txfm_ref;
IhtFunc inv_txfm_ref;
};
@@ -477,17 +583,34 @@
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
+ bit_depth_ = GET_PARAM(3);
pitch_ = 16;
fwd_txfm_ref = fdct16x16_ref;
inv_txfm_ref = idct16x16_ref;
+ mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth_) {
+      case VPX_BITS_10:
+        inv_txfm_ref = idct16x16_10_ref;
+        break;
+      case VPX_BITS_12:
+ inv_txfm_ref = idct16x16_12_ref;
+ break;
+ default:
+ inv_txfm_ref = idct16x16_ref;
+ break;
+ }
+#else
+ inv_txfm_ref = idct16x16_ref;
+#endif
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
@@ -527,17 +650,34 @@
fwd_txfm_ = GET_PARAM(0);
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
+ bit_depth_ = GET_PARAM(3);
pitch_ = 16;
fwd_txfm_ref = fht16x16_ref;
inv_txfm_ref = iht16x16_ref;
+ mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth_) {
+ case VPX_BITS_10:
+ inv_txfm_ref = iht16x16_10;
+ break;
+ case VPX_BITS_12:
+ inv_txfm_ref = iht16x16_12;
+ break;
+ default:
+ inv_txfm_ref = iht16x16_ref;
+ break;
+ }
+#else
+ inv_txfm_ref = iht16x16_ref;
+#endif
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride, tx_type_);
}
- void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride, tx_type_);
}
@@ -565,45 +705,78 @@
using std::tr1::make_tuple;
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans16x16DCT,
::testing::Values(
- make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
+ make_tuple(&vp9_high_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans16x16DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
::testing::Values(
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+ make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans16x16HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_fdct16x16_c,
- &vp9_idct16x16_256_add_neon, 0)));
+ &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8)));
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
::testing::Values(
make_tuple(&vp9_fdct16x16_sse2,
- &vp9_idct16x16_256_add_sse2, 0)));
+ &vp9_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
- make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
- make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
- make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
- make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
+ VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
+ VPX_BITS_8)));
#endif
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3, Trans16x16DCT,
::testing::Values(
- make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
+ make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
+ VPX_BITS_8)));
#endif
} // namespace
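The relaxed error bounds above follow from the wider pixel range: each extra bit of depth doubles the signal range, so the tolerated squared error grows fourfold. With bit depth $b$:

$\max_j (\mathrm{dst}_j - \mathrm{src}_j)^2 \le 2^{2(b-8)}, \qquad \mathrm{total\_error} \le \mathrm{count\_test\_block} \cdot 2^{2(b-8)}$

i.e. a per-sample cap of 1 at 8 bits, 16 at 10 bits, and 256 at 12 bits, which is exactly 1u << 2 * (bit_depth_ - 8).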
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 4f34a44..c7a1931 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -21,6 +21,7 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
using libvpx_test::ACMRandom;
@@ -37,7 +38,7 @@
const int kNumCoeffs = 1024;
const double kPi = 3.141592653589793238462643383279502884;
-void reference_32x32_dct_1d(const double in[32], double out[32], int stride) {
+void reference_32x32_dct_1d(const double in[32], double out[32]) {
const double kInvSqrt2 = 0.707106781186547524400844362104;
for (int k = 0; k < 32; k++) {
out[k] = 0.0;
@@ -55,7 +56,7 @@
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = input[j*32 + i];
- reference_32x32_dct_1d(temp_in, temp_out, 1);
+ reference_32x32_dct_1d(temp_in, temp_out);
for (int j = 0; j < 32; ++j)
output[j * 32 + i] = temp_out[j];
}
@@ -64,17 +65,28 @@
double temp_in[32], temp_out[32];
for (int j = 0; j < 32; ++j)
temp_in[j] = output[j + i*32];
- reference_32x32_dct_1d(temp_in, temp_out, 1);
+ reference_32x32_dct_1d(temp_in, temp_out);
// Scale by some magic number
for (int j = 0; j < 32; ++j)
output[j + i * 32] = temp_out[j] / 4;
}
}
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int> Trans32x32Param;
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+ Trans32x32Param;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct32x32_1024_add_c(in, out, stride, 10);
+}
+
+void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct32x32_1024_add_c(in, out, stride, 12);
+}
+#endif
class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
public:
@@ -84,12 +96,16 @@
inv_txfm_ = GET_PARAM(1);
version_ = GET_PARAM(2); // 0: high precision forward transform
// 1: low precision version for rd loop
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
int version_;
+ vpx_bit_depth_t bit_depth_;
+ int mask_;
FwdTxfmFunc fwd_txfm_;
InvTxfmFunc inv_txfm_;
};
@@ -100,23 +116,47 @@
int64_t total_error = 0;
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
+ }
}
ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
- ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,
+ CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const uint32_t diff = dst[j] - src[j];
+#endif
const uint32_t error = diff * diff;
if (max_error < error)
max_error = error;
@@ -129,10 +169,10 @@
total_error /= 45;
}
- EXPECT_GE(1u, max_error)
+ EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
<< "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
- EXPECT_GE(count_test_block, total_error)
+ EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
<< "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
}
@@ -141,12 +181,12 @@
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
const int stride = 32;
vp9_fdct32x32_c(input_block, output_ref_block, stride);
@@ -170,21 +210,21 @@
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- input_block[j] = rnd.Rand8() - rnd.Rand8();
- input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
}
if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = 255;
+ input_extreme_block[j] = mask_;
} else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = -255;
+ input_extreme_block[j] = -mask_;
}
const int stride = 32;
@@ -201,9 +241,9 @@
EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
<< "Error: 32x32 FDCT rd has mismatched coefficients";
}
- EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+ EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
<< "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
- EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+ EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
<< "Error: 32x32 FDCT has coefficient larger than "
<< "4*DCT_MAX_VALUE";
}
@@ -214,26 +254,49 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
for (int i = 0; i < count_test_block; ++i) {
double out_r[kNumCoeffs];
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- in[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+#endif
+ }
}
reference_32x32_dct_2d(in, out_r);
for (int j = 0; j < kNumCoeffs; ++j)
coeff[j] = round(out_r[j]);
- ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const int diff = dst[j] - src[j];
+#endif
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 32x32 IDCT has error " << error
@@ -244,39 +307,59 @@
using std::tr1::make_tuple;
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans32x32Test,
::testing::Values(
- make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
- make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
+ make_tuple(&vp9_high_fdct32x32_c,
+ &idct32x32_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fdct32x32_rd_c,
+ &idct32x32_10, 1, VPX_BITS_10),
+ make_tuple(&vp9_high_fdct32x32_c,
+ &idct32x32_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_high_fdct32x32_rd_c,
+ &idct32x32_12, 1, VPX_BITS_12),
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fdct32x32_rd_c,
+ &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans32x32Test,
+ ::testing::Values(
+ make_tuple(&vp9_fdct32x32_c,
+ &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fdct32x32_rd_c,
+ &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_fdct32x32_c,
- &vp9_idct32x32_1024_add_neon, 0),
+ &vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_c,
- &vp9_idct32x32_1024_add_neon, 1)));
+ &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_fdct32x32_sse2,
- &vp9_idct32x32_1024_add_sse2, 0),
+ &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_sse2,
- &vp9_idct32x32_1024_add_sse2, 1)));
+ &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif
-#if HAVE_AVX2
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
AVX2, Trans32x32Test,
::testing::Values(
make_tuple(&vp9_fdct32x32_avx2,
- &vp9_idct32x32_1024_add_sse2, 0),
+ &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
make_tuple(&vp9_fdct32x32_rd_avx2,
- &vp9_idct32x32_1024_add_sse2, 1)));
+ &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
#endif
} // namespace
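For reference, reference_32x32_dct_1d above is the textbook $N = 32$ DCT-II; written out:

$\mathrm{out}[k] = c_k \sum_{n=0}^{31} \mathrm{in}[n] \cos\!\left(\frac{\pi (2n+1) k}{64}\right), \qquad c_0 = \tfrac{1}{\sqrt{2}},\ c_k = 1 \text{ for } k > 0,$

and reference_32x32_dct_2d applies it down columns, then across rows, dividing by 4 at the end — presumably the "magic number" that aligns the float reference with the fixed-point scaling of vp9_fdct32x32_c.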
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 11529b3..5a71140 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -74,7 +74,7 @@
libvpx_test::WebMVideoSource video(video_name);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = threads;
libvpx_test::VP9Decoder decoder(cfg, 0);
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 99610eb..0ef4f7b 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -106,7 +106,7 @@
}
void DecoderTest::RunLoop(CompressedVideoSource *video) {
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
RunLoop(video, dec_cfg);
}
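The {0}-to-value-initialization swaps here, in decode_perf_test.cc and external_frame_buffer_test.cc above, and in encode_test_driver.cc below are behavior-neutral in C++: both forms zero every member of the config struct. The value-initialized spelling names no field at all, so it also sidesteps -Wmissing-field-initializers (a plausible motivation, not stated in the diff):

// Both lines zero threads, w, and h; only the aggregate form can warn
// about the unnamed trailing fields under -Wmissing-field-initializers.
vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();  // value-initialization
// vpx_codec_dec_cfg_t cfg = {0};                 // old aggregate form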
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 1f73c7d..a757b59 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -125,20 +125,20 @@
const vpx_codec_dec_cfg_t &dec_cfg);
// Hook to be called before decompressing every frame.
- virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
- Decoder *decoder) {}
+ virtual void PreDecodeFrameHook(const CompressedVideoSource& /*video*/,
+ Decoder* /*decoder*/) {}
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const CompressedVideoSource& /* video */,
+ const CompressedVideoSource& /*video*/,
Decoder *decoder) {
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
}
// Hook to be called on every decompressed frame.
- virtual void DecompressedFrameHook(const vpx_image_t& img,
- const unsigned int frame_number) {}
+ virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+ const unsigned int /*frame_number*/) {}
// Hook to be called on peek result
virtual void HandlePeekResult(Decoder* const decoder,
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 6d4281d..9702ddf 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -133,13 +133,13 @@
return match;
}
-void EncoderTest::MismatchHook(const vpx_image_t *img1,
- const vpx_image_t *img2) {
+void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
+ const vpx_image_t* /*img2*/) {
ASSERT_TRUE(0) << "Encode/Decode mismatch found";
}
void EncoderTest::RunLoop(VideoSource *video) {
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
stats_.Reset();
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 2270ce2..a77bd64 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -189,20 +189,21 @@
virtual void RunLoop(VideoSource *video);
// Hook to be called at the beginning of a pass.
- virtual void BeginPassHook(unsigned int pass) {}
+ virtual void BeginPassHook(unsigned int /*pass*/) {}
// Hook to be called at the end of a pass.
virtual void EndPassHook() {}
// Hook to be called before encoding a frame.
- virtual void PreEncodeFrameHook(VideoSource *video) {}
- virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
+ virtual void PreEncodeFrameHook(VideoSource* /*video*/) {}
+ virtual void PreEncodeFrameHook(VideoSource* /*video*/,
+ Encoder* /*encoder*/) {}
// Hook to be called on every compressed data packet.
- virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
// Hook to be called on every PSNR packet.
- virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {}
+ virtual void PSNRPktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {}
// Hook to determine whether the encode loop should continue.
virtual bool Continue() const {
@@ -218,19 +219,19 @@
const vpx_image_t *img2);
// Hook to be called on every decompressed frame.
- virtual void DecompressedFrameHook(const vpx_image_t& img,
- vpx_codec_pts_t pts) {}
+ virtual void DecompressedFrameHook(const vpx_image_t& /*img*/,
+ vpx_codec_pts_t /*pts*/) {}
// Hook to be called to handle decode result. Return true to continue.
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const VideoSource& /* video */,
+ const VideoSource& /*video*/,
Decoder *decoder) {
EXPECT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
return VPX_CODEC_OK == res_dec;
}
// Hook that can modify the encoder's output data
- virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(
+ virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook(
const vpx_codec_cx_pkt_t *pkt) {
return pkt;
}
diff --git a/test/examples.sh b/test/examples.sh
index 7ba9cce..39f7e39 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -15,7 +15,7 @@
example_tests=$(ls $(dirname $0)/*.sh)
# List of script names to exclude.
-exclude_list="examples vpxdec vpxenc tools_common"
+exclude_list="examples tools_common"
# Filter out the scripts in $exclude_list.
for word in ${exclude_list}; do
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index fb0449d..44eba33 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -285,7 +285,7 @@
video_->Init();
video_->Begin();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
ASSERT_TRUE(decoder_ != NULL);
}
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 7c48260..f803c8e 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,46 +20,71 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
-extern "C" {
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
using libvpx_test::ACMRandom;
namespace {
const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type);
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+ int tx_type) {
vp9_fdct4x4_c(in, out, stride);
}
-void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fht4x4_c(in, out, stride, tx_type);
}
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+ int tx_type) {
vp9_fwht4x4_c(in, out, stride);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct4x4_16_add_c(in, out, stride, 10);
+}
+
+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct4x4_16_add_c(in, out, stride, 12);
+}
+
+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_iwht4x4_16_add_c(in, out, stride, 12);
+}
+#endif
+
class Trans4x4TestBase {
public:
virtual ~Trans4x4TestBase() {}
protected:
- virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+ virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
- virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+ virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
void RunAccuracyCheck(int limit) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -68,23 +93,47 @@
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
// Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
+ }
}
ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
test_temp_block, pitch_));
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+ CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const uint32_t diff = dst[j] - src[j];
+#endif
const uint32_t error = diff * diff;
if (max_error < error)
max_error = error;
@@ -105,13 +154,13 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 5000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j)
- input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -127,21 +176,21 @@
const int count_test_block = 5000;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- input_block[j] = rnd.Rand8() - rnd.Rand8();
- input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
}
if (i == 0) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = 255;
+ input_extreme_block[j] = mask_;
} else if (i == 1) {
for (int j = 0; j < kNumCoeffs; ++j)
- input_extreme_block[j] = -255;
+ input_extreme_block[j] = -mask_;
}
fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -151,8 +200,8 @@
// The minimum quant value is 4.
for (int j = 0; j < kNumCoeffs; ++j) {
EXPECT_EQ(output_block[j], output_ref_block[j]);
- EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
- << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+ EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+ << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
}
}
}
@@ -161,24 +210,48 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < kNumCoeffs; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- in[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+#endif
+ }
}
fwd_txfm_ref(in, coeff, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+ pitch_));
+#endif
+ }
for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const uint32_t diff = dst[j] - src[j];
+#endif
const uint32_t error = diff * diff;
EXPECT_GE(static_cast<uint32_t>(limit), error)
<< "Error: 4x4 IDCT has error " << error
@@ -190,6 +263,8 @@
int pitch_;
int tx_type_;
FhtFunc fwd_txfm_ref;
+ vpx_bit_depth_t bit_depth_;
+ int mask_;
};
class Trans4x4DCT
@@ -204,14 +279,16 @@
tx_type_ = GET_PARAM(2);
pitch_ = 4;
fwd_txfm_ref = fdct4x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
@@ -247,15 +324,17 @@
tx_type_ = GET_PARAM(2);
pitch_ = 4;
fwd_txfm_ref = fht4x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride, tx_type_);
}
- void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride, tx_type_);
}
@@ -291,14 +370,16 @@
tx_type_ = GET_PARAM(2);
pitch_ = 4;
fwd_txfm_ref = fwht4x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
@@ -323,57 +404,95 @@
}
using std::tr1::make_tuple;
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans4x4DCT,
::testing::Values(
- make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+ make_tuple(&vp9_high_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans4x4HT,
::testing::Values(
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+ make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans4x4WHT,
::testing::Values(
- make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));
+ make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4WHT,
+ ::testing::Values(
+ make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_fdct4x4_c,
- &vp9_idct4x4_16_add_neon, 0)));
+ &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
DISABLED_NEON, Trans4x4HT,
::testing::Values(
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
- make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
#endif
-#if CONFIG_USE_X86INC && HAVE_MMX
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
MMX, Trans4x4WHT,
::testing::Values(
- make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+ make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
::testing::Values(
make_tuple(&vp9_fdct4x4_sse2,
- &vp9_idct4x4_16_add_sse2, 0)));
+ &vp9_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
- make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
#endif
} // namespace
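All three 4x4 fixtures above derive mask_ from the bit depth carried in the test tuple, and the input range the tests exercise follows directly (a worked sketch; bit_depth and rnd stand in for the fixture members):

// mask_ = (1 << bit_depth_) - 1, as set in each SetUp() above:
//   VPX_BITS_8  -> 255   (inputs span [-255, 255])
//   VPX_BITS_10 -> 1023  (inputs span [-1023, 1023])
//   VPX_BITS_12 -> 4095  (inputs span [-4095, 4095])
const int mask = (1 << bit_depth) - 1;
const int16_t sample = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);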
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 567e5f6..5455534 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,45 +20,96 @@
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
-extern "C" {
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
+const int kNumCoeffs = 64;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+ const double kInvSqrt2 = 0.707106781186547524400844362104;
+ for (int k = 0; k < 8; k++) {
+ out[k] = 0.0;
+ for (int n = 0; n < 8; n++)
+ out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);
+ if (k == 0)
+ out[k] = out[k] * kInvSqrt2;
+ }
+}
+
+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
+ double output[kNumCoeffs]) {
+ // First transform columns
+ for (int i = 0; i < 8; ++i) {
+ double temp_in[8], temp_out[8];
+ for (int j = 0; j < 8; ++j)
+ temp_in[j] = input[j*8 + i];
+ reference_8x8_dct_1d(temp_in, temp_out, 1);
+ for (int j = 0; j < 8; ++j)
+ output[j * 8 + i] = temp_out[j];
+ }
+ // Then transform rows
+ for (int i = 0; i < 8; ++i) {
+ double temp_in[8], temp_out[8];
+ for (int j = 0; j < 8; ++j)
+ temp_in[j] = output[j + i*8];
+ reference_8x8_dct_1d(temp_in, temp_out, 1);
+ // Scale by 2 to match the output scaling of vp9_fdct8x8.
+ for (int j = 0; j < 8; ++j)
+ output[j + i * 8] = temp_out[j] * 2;
+ }
}
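
For reference, the 1-D transform implemented by reference_8x8_dct_1d above is the DCT-II in the code's own scaling; in LaTeX,

    X_k = c_k \sum_{n=0}^{7} x_n \cos\!\left(\frac{\pi (2n+1) k}{16}\right),
    \qquad c_0 = \tfrac{1}{\sqrt{2}}, \quad c_k = 1 \text{ for } k > 0,

and reference_8x8_dct_2d applies it to columns, then rows, scaling the final result by 2 so it is directly comparable to the output of vp9_fdct8x8 in the accuracy checks below.
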
using libvpx_test::ACMRandom;
namespace {
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type);
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fdct8x8_c(in, out, stride);
}
-void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
vp9_fht8x8_c(in, out, stride, tx_type);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct8x8_64_add_c(in, out, stride, 10);
+}
+
+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
+ vp9_high_idct8x8_64_add_c(in, out, stride, 12);
+}
+
+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
class FwdTrans8x8TestBase {
public:
virtual ~FwdTrans8x8TestBase() {}
protected:
- virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
- virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+ virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+ virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
void RunSignBiasCheck() {
ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_output_block, 64);
int count_sign_block[64][2];
const int count_test_block = 100000;
@@ -67,7 +118,8 @@
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
+ ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
ASM_REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_output_block, pitch_));
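
To make the generation above concrete: with bit_depth_ == VPX_BITS_10, mask_ is (1 << 10) - 1 = 1023, so (rnd.Rand16() >> (16 - 10)) & mask_ is a uniform draw from [0, 1023] and the difference of two such draws lies in [-1023, 1023], i.e. [-mask_, mask_]; at 8 bits this reduces to the original [-255, 255] range.
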
@@ -82,7 +134,7 @@
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 1125;
- EXPECT_LT(diff, max_diff)
+ EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
<< "Error: 8x8 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-255, 255] at index " << j
@@ -111,7 +163,7 @@
for (int j = 0; j < 64; ++j) {
const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
const int max_diff = 10000;
- EXPECT_LT(diff, max_diff)
+ EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
<< "Error: 4x4 FDCT/FHT has a sign bias > "
<< 1. * max_diff / count_test_block * 100 << "%"
<< " for input range [-15, 15] at index " << j
@@ -127,16 +179,28 @@
int total_error = 0;
const int count_test_block = 100000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
+ }
}
ASM_REGISTER_STATE_CHECK(
@@ -152,11 +216,23 @@
test_temp_block[j] *= 4;
}
}
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(test_temp_block, dst, pitch_));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const int diff = dst[j] - src[j];
+#endif
const int error = diff * diff;
if (max_error < error)
max_error = error;
@@ -164,11 +240,11 @@
}
}
- EXPECT_GE(1, max_error)
+ EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
<< " roundtrip error > 1";
- EXPECT_GE(count_test_block/5, total_error)
+ EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
<< "error > 1/5 per block";
}
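
The widened bounds above scale with the square of the extra dynamic range: the per-pixel limit 1 << 2 * (bit_depth_ - 8) is 1 at 8 bits, 16 at 10 bits, and 256 at 12 bits, and the total-error bound keeps the same 1/5-per-block tolerance after the same shift. The forward accuracy check added below applies the analogous scaling to its base limit of 9, i.e. an absolute coefficient error of at most 3 in 8-bit units.
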
@@ -180,37 +256,68 @@
int total_coeff_error = 0;
const int count_test_block = 100000;
DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
- DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_temp_block, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
+ // Initialize a test block with input range [-mask_, mask_].
for (int j = 0; j < 64; ++j) {
- if (i == 0) {
- src[j] = 255;
- dst[j] = 0;
- } else if (i == 1) {
- src[j] = 0;
- dst[j] = 255;
+ if (bit_depth_ == VPX_BITS_8) {
+ if (i == 0) {
+ src[j] = 255;
+ dst[j] = 0;
+ } else if (i == 1) {
+ src[j] = 0;
+ dst[j] = 255;
+ } else {
+ src[j] = rnd.Rand8() % 2 ? 255 : 0;
+ dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+ }
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
} else {
- src[j] = rnd.Rand8() % 2 ? 255 : 0;
- dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+ if (i == 0) {
+ src16[j] = mask_;
+ dst16[j] = 0;
+ } else if (i == 1) {
+ src16[j] = 0;
+ dst16[j] = mask_;
+ } else {
+ src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+ dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+ }
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
}
-
- test_input_block[j] = src[j] - dst[j];
}
ASM_REGISTER_STATE_CHECK(
RunFwdTxfm(test_input_block, test_temp_block, pitch_));
ASM_REGISTER_STATE_CHECK(
fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(test_temp_block, dst, pitch_));
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
const int diff = dst[j] - src[j];
+#endif
const int error = diff * diff;
if (max_error < error)
max_error = error;
@@ -220,11 +327,11 @@
total_coeff_error += abs(coeff_diff);
}
- EXPECT_GE(1, max_error)
+ EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
<< "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
<< "an individual roundtrip error > 1";
- EXPECT_GE(count_test_block/5, total_error)
+ EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
<< "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
<< " roundtrip error > 1/5 per block";
@@ -234,9 +341,97 @@
}
}
+ void RunInvAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+#endif
+
+ for (int i = 0; i < count_test_block; ++i) {
+ double out_r[kNumCoeffs];
+
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ if (bit_depth_ == VPX_BITS_8) {
+ src[j] = rnd.Rand8() % 2 ? 255 : 0;
+ dst[j] = src[j] > 0 ? 0 : 255;
+ in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+ dst16[j] = src16[j] > 0 ? 0 : mask_;
+ in[j] = src16[j] - dst16[j];
+#endif
+ }
+ }
+
+ reference_8x8_dct_2d(in, out_r);
+ for (int j = 0; j < kNumCoeffs; ++j)
+ coeff[j] = round(out_r[j]);
+
+ if (bit_depth_ == VPX_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+ pitch_));
+#endif
+ }
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint32_t diff =
+ bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+ const uint32_t diff = dst[j] - src[j];
+#endif
+ const uint32_t error = diff * diff;
+ EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+ << "Error: 8x8 IDCT has error " << error
+ << " at index " << j;
+ }
+ }
+ }
+
+ void RunFwdAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_r, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ double out_r[kNumCoeffs];
+
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < kNumCoeffs; ++j)
+ in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
+
+ RunFwdTxfm(in, coeff, pitch_);
+ reference_8x8_dct_2d(in, out_r);
+ for (int j = 0; j < kNumCoeffs; ++j)
+ coeff_r[j] = round(out_r[j]);
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = coeff[j] - coeff_r[j];
+ const uint32_t error = diff * diff;
+ EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+ << "Error: 8x8 DCT has error " << error
+ << " at index " << j;
+ }
+ }
+ }
int pitch_;
int tx_type_;
FhtFunc fwd_txfm_ref;
+ vpx_bit_depth_t bit_depth_;
+ int mask_;
};
class FwdTrans8x8DCT
@@ -251,15 +446,17 @@
tx_type_ = GET_PARAM(2);
pitch_ = 8;
fwd_txfm_ref = fdct8x8_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride);
}
- void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride);
}
@@ -279,6 +476,14 @@
RunExtremalCheck();
}
+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {
+ RunFwdAccuracyCheck();
+}
+
+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
class FwdTrans8x8HT
: public FwdTrans8x8TestBase,
public ::testing::TestWithParam<Ht8x8Param> {
@@ -291,15 +496,17 @@
tx_type_ = GET_PARAM(2);
pitch_ = 8;
fwd_txfm_ref = fht8x8_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
}
virtual void TearDown() { libvpx_test::ClearSystemState(); }
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+ void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
fwd_txfm_(in, out, stride, tx_type_);
}
- void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+ void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
inv_txfm_(out, dst, stride, tx_type_);
}
@@ -321,50 +528,81 @@
using std::tr1::make_tuple;
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
+ make_tuple(&vp9_high_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, FwdTrans8x8DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+ make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+ C, FwdTrans8x8HT,
+ ::testing::Values(
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#endif
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
NEON, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
+ make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
+ VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
DISABLED_NEON, FwdTrans8x8HT,
::testing::Values(
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
- make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
+ make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
+ VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
- make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
#endif
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+ make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
+ VPX_BITS_8)));
#endif
} // namespace
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 2400c20..1c9a522 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -27,7 +27,7 @@
}
virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
- const libvpx_test::VideoSource &video,
+ const libvpx_test::VideoSource& /*video*/,
libvpx_test::Decoder *decoder) {
EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
return !::testing::Test::HasFailure();
@@ -72,7 +72,13 @@
// one for each lag in frames (for 2 pass), and then one for each possible
// reference buffer (8) - we can end up with up to 30 buffers of roughly this
// size or almost 1 gig of memory.
+ // In total the allocations will exceed 2GiB, which may cause a failure
+ // with mingw + wine; use a smaller size in that case.
+#if defined(_WIN32) && !defined(_WIN64)
+ video.SetSize(4096, 3072);
+#else
video.SetSize(4096, 4096);
+#endif
video.set_limit(2);
expected_res_ = VPX_CODEC_OK;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
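
Rough arithmetic behind the comment above: a 4096x4096 I420 frame holds 4096 * 4096 * 3 / 2 bytes, about 24 MiB of pixel data before borders and alignment, so the roughly 30 buffers mentioned already approach 1 GiB, and total process allocations can then cross the 2 GiB address-space ceiling of a 32-bit build; dropping the height to 3072 trims each buffer by a quarter.
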
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 5f4c33a..f488cb4 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -109,7 +109,8 @@
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
- int16_t input[64], coeff[64];
+ int16_t input[64];
+ tran_low_t coeff[64];
double output_r[64];
uint8_t dst[64], src[64];
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index ead4760..f0d9c34 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -294,6 +294,11 @@
::testing::Values(
vp8_build_intra_predictors_mby_s_ssse3));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredYTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mby_s_neon));
+#endif
typedef void (*IntraPredUvFunc)(MACROBLOCKD *x,
uint8_t *uabove_row,
@@ -382,5 +387,10 @@
::testing::Values(
vp8_build_intra_predictors_mbuv_s_ssse3));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IntraPredUVTest,
+ ::testing::Values(
+ vp8_build_intra_predictors_mbuv_s_neon));
+#endif
} // namespace
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 0a1c17c..b61d490 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -73,7 +73,7 @@
void RunTest() {
const DecodeParam input = GET_PARAM(1);
libvpx_test::CompressedVideoSource *video = NULL;
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = input.threads;
const std::string filename = input.filename;
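
The switch from '= {0}' to value-initialization above presumably sidesteps missing-field-initializer warnings while still zeroing every member; a minimal C++ sketch with a hypothetical struct (not a libvpx type):

    // Hypothetical config struct, for illustration only.
    struct Cfg {
      unsigned int threads;
      unsigned int w, h;
    };
    Cfg a = {0};    // aggregate init: w and h are still zero-initialized,
                    // but -Wmissing-field-initializers warns under -Wextra
    Cfg b = Cfg();  // value-initialization: all members zeroed, no warning
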
@@ -113,9 +113,14 @@
const DecodeParam kVP9InvalidFileTests[] = {
{1, "invalid-vp90-02-v2.webm"},
{1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"},
- {1, "invalid-vp90-03-v2.webm"},
+ {1, "invalid-vp90-03-v3.webm"},
{1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf"},
{1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
+ {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
+ {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
+ {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
+ {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+ {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
};
VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -126,9 +131,9 @@
class InvalidFileInvalidPeekTest : public InvalidFileTest {
protected:
InvalidFileInvalidPeekTest() : InvalidFileTest() {}
- virtual void HandlePeekResult(libvpx_test::Decoder *const decoder,
- libvpx_test::CompressedVideoSource *video,
- const vpx_codec_err_t res_peek) {}
+ virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/,
+ libvpx_test::CompressedVideoSource* /*video*/,
+ const vpx_codec_err_t /*res_peek*/) {}
};
TEST_P(InvalidFileInvalidPeekTest, ReturnCode) {
@@ -144,6 +149,10 @@
const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
{4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
+ {4, "invalid-"
+ "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
+ {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+ {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
};
INSTANTIATE_TEST_CASE_P(
diff --git a/test/md5_helper.h b/test/md5_helper.h
index dc95582..1db712b 100644
--- a/test/md5_helper.h
+++ b/test/md5_helper.h
@@ -28,7 +28,8 @@
 // plane, we never want to round down and thus skip a pixel, so if
 // we are shifting by 1 (chroma_shift), we add 1 before doing the shift.
// This works only for chroma_shift of 0 and 1.
- const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytes_per_sample =
+ (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
const int h = plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h;
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
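
A quick example of the rounding logic described above: with d_h = 5 and y_chroma_shift = 1, the hashed plane height is (5 + 1) >> 1 = 3 chroma rows rather than 5 >> 1 = 2, so the odd final row is not skipped; as the comment notes, this only holds for chroma shifts of 0 and 1.
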
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index 15f4e6c..9c24fee 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -26,8 +26,8 @@
using libvpx_test::ACMRandom;
namespace {
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
typedef std::tr1::tuple<FwdTxfmFunc,
InvTxfmFunc,
InvTxfmFunc,
@@ -74,8 +74,8 @@
FAIL() << "Wrong Size!";
break;
}
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
@@ -83,7 +83,7 @@
const int block_size = size * size;
DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kMaxNumCoeffs);
int max_error = 0;
for (int i = 0; i < count_test_block; ++i) {
@@ -153,8 +153,8 @@
FAIL() << "Wrong Size!";
break;
}
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
const int count_test_block = 1000;
@@ -229,6 +229,7 @@
&vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_c,
TX_4X4, 1)));
+
#if HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
@@ -259,7 +260,7 @@
TX_4X4, 1)));
#endif
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2, PartialIDctTest,
::testing::Values(
@@ -293,7 +294,7 @@
TX_4X4, 1)));
#endif
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3_64, PartialIDctTest,
::testing::Values(
@@ -303,7 +304,7 @@
TX_8X8, 12)));
#endif
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSSE3, PartialIDctTest,
::testing::Values(
diff --git a/test/resize_test.cc b/test/resize_test.cc
index 8d08f1e..9d0c570 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -211,8 +211,8 @@
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
}
- virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
#if WRITE_COMPRESSED_STREAM
+ virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
++out_frames_;
// Write initial file header if first frame.
@@ -222,8 +222,8 @@
// Write frame header and data.
write_ivf_frame_header(pkt, outfile_);
(void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
-#endif
}
+#endif
double frame0_psnr_;
#if WRITE_COMPRESSED_STREAM
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e63770b..5377c1e 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -505,21 +505,6 @@
INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
#endif // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;
-const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;
-const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;
-const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;
-const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;
-const SadMxNVp9Param mmx_vp9_tests[] = {
- make_tuple(16, 16, sad_16x16_mmx_vp9),
- make_tuple(8, 16, sad_8x16_mmx_vp9),
- make_tuple(16, 8, sad_16x8_mmx_vp9),
- make_tuple(8, 8, sad_8x8_mmx_vp9),
- make_tuple(4, 4, sad_4x4_mmx_vp9),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_MMX
#if HAVE_SSE
diff --git a/test/set_maps.sh b/test/set_maps.sh
new file mode 100755
index 0000000..e7c8d43
--- /dev/null
+++ b/test/set_maps.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx set_maps example. To add new tests to this file,
+## do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to set_maps_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBVPX_BIN_PATH.
+set_maps_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ -z "$(vpx_tool_path set_maps)" ]; then
+ elog "set_maps not found. It must exist in LIBVPX_BIN_PATH or its parent."
+ return 1
+ fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+ local encoder="$(vpx_tool_path set_maps)"
+ local codec="$1"
+ local output_file="${VPX_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+set_maps_vp8() {
+ if [ "$(vp8_encode_available)" = "yes" ]; then
+ set_maps vp8 || return 1
+ fi
+}
+
+set_maps_vp9() {
+ if [ "$(vp9_encode_available)" = "yes" ]; then
+ set_maps vp9 || return 1
+ fi
+}
+
+set_maps_tests="set_maps_vp8
+ set_maps_vp9"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"
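
The error messages above suggest the script is pointed at its inputs through the environment, e.g. LIBVPX_BIN_PATH for the built examples and LIBVPX_TEST_DATA_PATH for the YUV test data, along the lines of 'LIBVPX_BIN_PATH=path/to/examples LIBVPX_TEST_DATA_PATH=path/to/test-data sh test/set_maps.sh' (paths here are placeholders).
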
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 6619fb1..ff42725 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -105,7 +105,7 @@
INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
::testing::Values(vp8_subtract_b_c));
-#if HAVE_NEON_ASM
+#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
::testing::Values(vp8_subtract_b_neon));
#endif
diff --git a/test/svc_test.cc b/test/svc_test.cc
index e9cf38d..5035dee 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -13,6 +13,9 @@
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/i420_video_source.h"
+
+#include "vp9/decoder/vp9_decoder.h"
+
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
@@ -21,6 +24,7 @@
using libvpx_test::CodecFactory;
using libvpx_test::Decoder;
+using libvpx_test::DxDataIterator;
using libvpx_test::VP9CodecFactory;
class SvcTest : public ::testing::Test {
@@ -56,15 +60,311 @@
codec_enc_.kf_min_dist = 100;
codec_enc_.kf_max_dist = 100;
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9CodecFactory codec_factory;
decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
}
virtual void TearDown() {
- vpx_svc_release(&svc_);
+ ReleaseEncoder();
delete(decoder_);
+ }
+
+ void InitializeEncoder() {
+ const vpx_codec_err_t res =
+ vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4); // Make the test faster
+ codec_initialized_ = true;
+ }
+
+ void ReleaseEncoder() {
+ vpx_svc_release(&svc_);
if (codec_initialized_) vpx_codec_destroy(&codec_);
+ codec_initialized_ = false;
+ }
+
+ void GetStatsData(std::string *const stats_buf) {
+ vpx_codec_iter_t iter = NULL;
+ const vpx_codec_cx_pkt_t *cx_pkt;
+
+ while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+ if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
+ EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
+ ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
+ stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
+ cx_pkt->data.twopass_stats.sz);
+ }
+ }
+ }
+
+ void Pass1EncodeNFrames(const int n, const int layers,
+ std::string *const stats_buf) {
+ vpx_codec_err_t res;
+
+ ASSERT_GT(n, 0);
+ ASSERT_GT(layers, 0);
+ svc_.spatial_layers = layers;
+ codec_enc_.g_pass = VPX_RC_FIRST_PASS;
+ InitializeEncoder();
+
+ libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ codec_enc_.g_timebase.den,
+ codec_enc_.g_timebase.num, 0, 30);
+ video.Begin();
+
+ for (int i = 0; i < n; ++i) {
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_GOOD_QUALITY);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ GetStatsData(stats_buf);
+ video.Next();
+ }
+
+ // Flush encoder and test EOS packet.
+ res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
+ video.duration(), VPX_DL_GOOD_QUALITY);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ GetStatsData(stats_buf);
+
+ ReleaseEncoder();
+ }
+
+ void StoreFrames(const size_t max_frame_received,
+ struct vpx_fixed_buf *const outputs,
+ size_t *const frame_received) {
+ vpx_codec_iter_t iter = NULL;
+ const vpx_codec_cx_pkt_t *cx_pkt;
+
+ while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+ if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+ const size_t frame_size = cx_pkt->data.frame.sz;
+
+ EXPECT_GT(frame_size, 0U);
+ ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
+ ASSERT_LT(*frame_received, max_frame_received);
+
+ if (*frame_received == 0)
+ EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
+
+ outputs[*frame_received].buf = malloc(frame_size + 16);
+ ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+ memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
+ frame_size);
+ outputs[*frame_received].sz = frame_size;
+ ++(*frame_received);
+ }
+ }
+ }
+
+ void Pass2EncodeNFrames(std::string *const stats_buf,
+ const int n, const int layers,
+ struct vpx_fixed_buf *const outputs) {
+ vpx_codec_err_t res;
+ size_t frame_received = 0;
+
+ ASSERT_TRUE(outputs != NULL);
+ ASSERT_GT(n, 0);
+ ASSERT_GT(layers, 0);
+ svc_.spatial_layers = layers;
+ codec_enc_.rc_target_bitrate = 500;
+ if (codec_enc_.g_pass == VPX_RC_LAST_PASS) {
+ ASSERT_TRUE(stats_buf != NULL);
+ ASSERT_GT(stats_buf->size(), 0U);
+ codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0];
+ codec_enc_.rc_twopass_stats_in.sz = stats_buf->size();
+ }
+ InitializeEncoder();
+
+ libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ codec_enc_.g_timebase.den,
+ codec_enc_.g_timebase.num, 0, 30);
+ video.Begin();
+
+ for (int i = 0; i < n; ++i) {
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_GOOD_QUALITY);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ StoreFrames(n, outputs, &frame_received);
+ video.Next();
+ }
+
+ // Flush encoder.
+ res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
+ video.duration(), VPX_DL_GOOD_QUALITY);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ StoreFrames(n, outputs, &frame_received);
+
+ EXPECT_EQ(frame_received, static_cast<size_t>(n));
+
+ ReleaseEncoder();
+ }
+
+ void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) {
+ int decoded_frames = 0;
+ int received_frames = 0;
+
+ ASSERT_TRUE(inputs != NULL);
+ ASSERT_GT(n, 0);
+
+ for (int i = 0; i < n; ++i) {
+ ASSERT_TRUE(inputs[i].buf != NULL);
+ ASSERT_GT(inputs[i].sz, 0U);
+ const vpx_codec_err_t res_dec =
+ decoder_->DecodeFrame(static_cast<const uint8_t *>(inputs[i].buf),
+ inputs[i].sz);
+ ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+ ++decoded_frames;
+
+ DxDataIterator dec_iter = decoder_->GetDxData();
+ while (dec_iter.Next() != NULL) {
+ ++received_frames;
+ }
+ }
+ EXPECT_EQ(decoded_frames, n);
+ EXPECT_EQ(received_frames, n);
+ }
+
+ void DropLayersAndMakeItVP9Compatible(struct vpx_fixed_buf *const inputs,
+                                       const int num_super_frames,
+                                       const int remaining_spatial_layers,
+ const bool is_multiple_frame_contexts) {
+ ASSERT_TRUE(inputs != NULL);
+ ASSERT_GT(num_super_frames, 0);
+ ASSERT_GT(remaining_spatial_layers, 0);
+
+ for (int i = 0; i < num_super_frames; ++i) {
+ uint32_t frame_sizes[8] = {0};
+ int frame_count = 0;
+ int frames_found = 0;
+ int frame;
+ ASSERT_TRUE(inputs[i].buf != NULL);
+ ASSERT_GT(inputs[i].sz, 0U);
+
+ vpx_codec_err_t res =
+ vp9_parse_superframe_index(static_cast<const uint8_t*>(inputs[i].buf),
+ inputs[i].sz, frame_sizes, &frame_count,
+ NULL, NULL);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+
+ if (frame_count == 0) {
+ // There is no superframe index; this is a single frame.
+ ASSERT_EQ(1, remaining_spatial_layers);
+ if (is_multiple_frame_contexts) {
+ // Make a new super frame.
+ uint8_t marker = 0xc1;
+ unsigned int mask;
+ int mag;
+
+ // Choose the magnitude.
+ for (mag = 0, mask = 0xff; mag < 4; ++mag) {
+ if (inputs[i].sz < mask)
+ break;
+ mask <<= 8;
+ mask |= 0xff;
+ }
+ marker |= mag << 3;
+ int index_sz = 2 + (mag + 1) * 2;
+
+ inputs[i].buf = realloc(inputs[i].buf, inputs[i].sz + index_sz + 16);
+ ASSERT_TRUE(inputs[i].buf != NULL);
+ uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+ frame_data[0] &= ~2; // Set the show_frame flag to 0.
+ frame_data += inputs[i].sz;
+ // Add a one-byte frame with the show_existing_frame flag set.
+ *frame_data++ = 0x88;
+
+ // Write the super frame index.
+ *frame_data++ = marker;
+
+ frame_sizes[0] = inputs[i].sz;
+ frame_sizes[1] = 1;
+ for (int j = 0; j < 2; ++j) {
+ unsigned int this_sz = frame_sizes[j];
+ for (int k = 0; k <= mag; k++) {
+ *frame_data++ = this_sz & 0xff;
+ this_sz >>= 8;
+ }
+ }
+ *frame_data++ = marker;
+ inputs[i].sz += index_sz + 1;
+ }
+ } else {
+ // Found a superframe index.
+ uint8_t *frame_data = static_cast<uint8_t*>(inputs[i].buf);
+ uint8_t *frame_start = frame_data;
+ for (frame = 0; frame < frame_count; ++frame) {
+ // Looking for a visible frame.
+ if (frame_data[0] & 0x02) {
+ ++frames_found;
+ if (frames_found == remaining_spatial_layers)
+ break;
+ }
+ frame_data += frame_sizes[frame];
+ }
+ ASSERT_LT(frame, frame_count) << "Couldn't find a visible frame. "
+     << "remaining_spatial_layers: " << remaining_spatial_layers
+     << " super_frame: " << i
+     << " is_multiple_frame_contexts: " << is_multiple_frame_contexts;
+ if (frame == frame_count - 1 && !is_multiple_frame_contexts)
+ continue;
+
+ frame_data += frame_sizes[frame];
+
+ // We need to add one more frame for multiple frame contexts.
+ if (is_multiple_frame_contexts)
+ ++frame;
+ uint8_t marker =
+ static_cast<const uint8_t*>(inputs[i].buf)[inputs[i].sz - 1];
+ const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+ const size_t index_sz = 2 + mag * frame_count;
+ const size_t new_index_sz = 2 + mag * (frame + 1);
+ marker &= 0x0f8;
+ marker |= frame;
+
+ // Copy existing frame sizes.
+ memmove(frame_data + (is_multiple_frame_contexts ? 2 : 1),
+ frame_start + inputs[i].sz - index_sz + 1, new_index_sz - 2);
+ if (is_multiple_frame_contexts) {
+ // Add a one-byte frame with the show_existing_frame flag set.
+ *frame_data++ = 0x88 | (remaining_spatial_layers - 1);
+ }
+ // New marker.
+ frame_data[0] = marker;
+ frame_data += (mag * (frame + 1) + 1);
+
+ if (is_multiple_frame_contexts) {
+ // Write the frame size for the one-byte frame.
+ frame_data -= mag;
+ *frame_data++ = 1;
+ for (uint32_t j = 1; j < mag; ++j) {
+ *frame_data++ = 0;
+ }
+ }
+
+ *frame_data++ = marker;
+ inputs[i].sz = frame_data - frame_start;
+
+ if (is_multiple_frame_contexts) {
+ // Change the show frame flag to 0 for all frames.
+ for (int j = 0; j < frame; ++j) {
+ frame_start[0] &= ~2;
+ frame_start += frame_sizes[j];
+ }
+ }
+ }
+ }
+ }
+
+ void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) {
+ ASSERT_TRUE(inputs != NULL);
+ ASSERT_GT(n, 0);
+
+ for (int i = 0; i < n; ++i) {
+ free(inputs[i].buf);
+ inputs[i].buf = NULL;
+ inputs[i].sz = 0;
+ }
}
SvcContext svc_;
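
The helpers above build and rewrite VP9 superframe indexes by hand. As a compact sketch of the layout they rely on (the helper name is hypothetical; it mirrors the marker and magnitude handling in DropLayersAndMakeItVP9Compatible):

    #include <stdint.h>

    // Appends a VP9 superframe index for n frames whose sizes are in
    // sizes[], each stored in (mag + 1) little-endian bytes.  Marker byte:
    // bits 7-5 = 0b110, bits 4-3 = mag, bits 2-0 = n - 1.  The index is
    // marker, sizes, marker, so a parser can read it from either end.
    uint8_t *WriteSuperframeIndex(uint8_t *dst, const uint32_t *sizes,
                                  int n, int mag) {
      const uint8_t marker = 0xc0 | ((mag & 3) << 3) | ((n - 1) & 7);
      *dst++ = marker;
      for (int i = 0; i < n; ++i) {
        uint32_t sz = sizes[i];
        for (int b = 0; b <= mag; ++b) {
          *dst++ = (uint8_t)(sz & 0xff);
          sz >>= 8;
        }
      }
      *dst++ = marker;
      return dst;
    }

With n = 2 and mag = 0 this produces the 0xc1 marker and the 2 + (0 + 1) * 2 index bytes seen in the single-frame path above.
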
@@ -93,9 +393,7 @@
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
svc_.spatial_layers = 0; // use default layers
- res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
}
@@ -106,9 +404,7 @@
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_scale_factors(&svc_, "4/16,16/16"); // valid scale values
- res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
TEST_F(SvcTest, InvalidOptions) {
@@ -122,20 +418,17 @@
}
TEST_F(SvcTest, SetLayersOption) {
- vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3");
EXPECT_EQ(VPX_CODEC_OK, res);
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
EXPECT_EQ(3, svc_.spatial_layers);
}
TEST_F(SvcTest, SetMultipleOptions) {
vpx_codec_err_t res =
- vpx_svc_set_options(&svc_, "layers=2 scale-factors=1/3,2/3");
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3");
EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
EXPECT_EQ(2, svc_.spatial_layers);
}
@@ -149,9 +442,7 @@
res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
EXPECT_EQ(VPX_CODEC_OK, res);
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
TEST_F(SvcTest, SetQuantizersOption) {
@@ -162,9 +453,7 @@
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_options(&svc_, "quantizers=40,45");
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
TEST_F(SvcTest, SetAutoAltRefOption) {
@@ -180,9 +469,7 @@
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
TEST_F(SvcTest, SetQuantizers) {
@@ -200,9 +487,7 @@
res = vpx_svc_set_quantizers(&svc_, "40,30");
EXPECT_EQ(VPX_CODEC_OK, res);
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
TEST_F(SvcTest, SetScaleFactors) {
@@ -220,121 +505,25 @@
res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
EXPECT_EQ(VPX_CODEC_OK, res);
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
}
// Test that decoder can handle an SVC frame as the first frame in a sequence.
-TEST_F(SvcTest, FirstFrameHasLayers) {
- svc_.spatial_layers = 2;
- vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
- vpx_svc_set_quantizers(&svc_, "40,30");
-
- vpx_codec_err_t res =
- vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
-
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
- codec_enc_.g_timebase.den,
- codec_enc_.g_timebase.num, 0, 30);
- video.Begin();
-
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- EXPECT_EQ(VPX_CODEC_OK, res);
-
- if (vpx_svc_get_frame_size(&svc_) == 0) {
- // Flush encoder
- res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
- video.duration(), VPX_DL_GOOD_QUALITY);
- EXPECT_EQ(VPX_CODEC_OK, res);
- }
-
- int frame_size = vpx_svc_get_frame_size(&svc_);
- EXPECT_GT(frame_size, 0);
- const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
-
- // this test fails with a decoder error
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+TEST_F(SvcTest, OnePassEncodeOneFrame) {
+ codec_enc_.g_pass = VPX_RC_ONE_PASS;
+ vpx_fixed_buf output = {0};
+ Pass2EncodeNFrames(NULL, 1, 2, &output);
+ DecodeNFrames(&output, 1);
+ FreeBitstreamBuffers(&output, 1);
}
-TEST_F(SvcTest, EncodeThreeFrames) {
- svc_.spatial_layers = 2;
- vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
- vpx_svc_set_quantizers(&svc_, "40,30");
- int decoded_frames = 0;
- vpx_codec_err_t res_dec;
- int frame_size;
-
- vpx_codec_err_t res =
- vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- ASSERT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
-
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
- codec_enc_.g_timebase.den,
- codec_enc_.g_timebase.num, 0, 30);
- // FRAME 0
- video.Begin();
- // This frame is a keyframe.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
-
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
-
- // FRAME 1
- video.Next();
- // This is a P-frame.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
-
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
-
- // FRAME 2
- video.Next();
- // This is a P-frame.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
-
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
-
- // Flush encoder
- res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
- video.duration(), VPX_DL_GOOD_QUALITY);
- EXPECT_EQ(VPX_CODEC_OK, res);
-
- while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
-
- EXPECT_EQ(decoded_frames, 3);
+TEST_F(SvcTest, OnePassEncodeThreeFrames) {
+ codec_enc_.g_pass = VPX_RC_ONE_PASS;
+ vpx_fixed_buf outputs[3];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]);
+ DecodeNFrames(&outputs[0], 3);
+ FreeBitstreamBuffers(&outputs[0], 3);
}
TEST_F(SvcTest, GetLayerResolution) {
@@ -342,14 +531,11 @@
vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
vpx_svc_set_quantizers(&svc_, "40,30");
- vpx_codec_err_t res =
- vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- EXPECT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+ InitializeEncoder();
// ensure that requested layer is a valid layer
uint32_t layer_width, layer_height;
- res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
+ vpx_codec_err_t res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
&layer_width, &layer_height);
EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
@@ -373,131 +559,313 @@
EXPECT_EQ(kHeight * 8 / 16, layer_height);
}
-TEST_F(SvcTest, TwoPassEncode) {
+TEST_F(SvcTest, TwoPassEncode10Frames) {
// First pass encode
std::string stats_buf;
- svc_.spatial_layers = 2;
- codec_enc_.g_pass = VPX_RC_FIRST_PASS;
- vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
- vpx_svc_set_quantizers(&svc_, "40,30");
- vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
-
- vpx_codec_err_t res =
- vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- ASSERT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
-
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
- codec_enc_.g_timebase.den,
- codec_enc_.g_timebase.num, 0, 30);
- // FRAME 0
- video.Begin();
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
- size_t stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
- EXPECT_GT(stats_size, 0U);
- const char *stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
- ASSERT_TRUE(stats_data != NULL);
- stats_buf.append(stats_data, stats_size);
-
- // FRAME 1
- video.Next();
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
- EXPECT_GT(stats_size, 0U);
- stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
- ASSERT_TRUE(stats_data != NULL);
- stats_buf.append(stats_data, stats_size);
-
- // Flush encoder and test EOS packet
- res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
- EXPECT_GT(stats_size, 0U);
- stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
- ASSERT_TRUE(stats_data != NULL);
- stats_buf.append(stats_data, stats_size);
-
- // Tear down encoder
- vpx_svc_release(&svc_);
- vpx_codec_destroy(&codec_);
+ Pass1EncodeNFrames(10, 2, &stats_buf);
// Second pass encode
- int decoded_frames = 0;
- vpx_codec_err_t res_dec;
- int frame_size;
codec_enc_.g_pass = VPX_RC_LAST_PASS;
- vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
- vpx_svc_set_quantizers(&svc_, "40,30");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(20, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
- codec_enc_.rc_twopass_stats_in.buf = &stats_buf[0];
- codec_enc_.rc_twopass_stats_in.sz = stats_buf.size();
+ vpx_fixed_buf outputs[20];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+ DecodeNFrames(&outputs[0], 20);
+ FreeBitstreamBuffers(&outputs[0], 20);
+}
- res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
- ASSERT_EQ(VPX_CODEC_OK, res);
- codec_initialized_ = true;
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 2, &stats_buf);
- // FRAME 0
- video.Begin();
- // This frame is a keyframe.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, false);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
+TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 5, &stats_buf);
- // FRAME 1
- video.Next();
- // This is a P-frame.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]);
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
+ DecodeNFrames(&outputs[0], 10);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 4, false);
+ DecodeNFrames(&outputs[0], 10);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 3, false);
+ DecodeNFrames(&outputs[0], 10);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, false);
+ DecodeNFrames(&outputs[0], 10);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, false);
+ DecodeNFrames(&outputs[0], 10);
- // FRAME 2
- video.Next();
- // This is a P-frame.
- res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
- video.duration(), VPX_DL_GOOD_QUALITY);
- ASSERT_EQ(VPX_CODEC_OK, res);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
- if ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
- }
+TEST_F(SvcTest, TwoPassEncode2SNRLayers) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+ Pass1EncodeNFrames(20, 2, &stats_buf);
- // Flush encoder
- res = vpx_svc_encode(&svc_, &codec_, NULL, 0,
- video.duration(), VPX_DL_GOOD_QUALITY);
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ vpx_svc_set_options(&svc_,
+ "auto-alt-refs=1,1 scale-factors=1/1,1/1");
+ vpx_fixed_buf outputs[20];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]);
+ DecodeNFrames(&outputs[0], 20);
+ FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+ Pass1EncodeNFrames(20, 3, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ vpx_svc_set_options(&svc_,
+ "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1");
+ vpx_fixed_buf outputs[20];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]);
+ DecodeNFrames(&outputs[0], 20);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 20, 2, false);
+ DecodeNFrames(&outputs[0], 20);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 20, 1, false);
+ DecodeNFrames(&outputs[0], 20);
+
+ FreeBitstreamBuffers(&outputs[0], 20);
+}
+
+TEST_F(SvcTest, SetMultipleFrameContextsOption) {
+ svc_.spatial_layers = 5;
+ vpx_codec_err_t res =
+ vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
- while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
- EXPECT_EQ((decoded_frames == 0), vpx_svc_is_keyframe(&svc_));
- res_dec = decoder_->DecodeFrame(
- static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)), frame_size);
- ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
- ++decoded_frames;
+ svc_.spatial_layers = 2;
+ res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1");
+ InitializeEncoder();
+}
+
+TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) {
+ // First pass encode
+ std::string stats_buf;
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 1, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1");
+ Pass1EncodeNFrames(10, 2, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]);
+ DropLayersAndMakeItVP9Compatible(&outputs[0], 10, 2, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1");
+ Pass1EncodeNFrames(10, 3, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]);
+
+ vpx_fixed_buf outputs_new[10];
+ for (int i = 0; i < 10; ++i) {
+ outputs_new[i].buf = malloc(outputs[i].sz + 16);
+ ASSERT_TRUE(outputs_new[i].buf != NULL);
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
}
+ DropLayersAndMakeItVP9Compatible(&outputs_new[0], 10, 3, true);
+ DecodeNFrames(&outputs_new[0], 10);
- EXPECT_EQ(decoded_frames, 3);
+ for (int i = 0; i < 10; ++i) {
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
+ }
+ DropLayersAndMakeItVP9Compatible(&outputs_new[0], 10, 2, true);
+ DecodeNFrames(&outputs_new[0], 10);
+
+ for (int i = 0; i < 10; ++i) {
+ memcpy(outputs_new[i].buf, outputs[i].buf, outputs[i].sz);
+ outputs_new[i].sz = outputs[i].sz;
+ }
+ DropLayersAndMakeItVP9Comaptible(&outputs_new[0], 10, 1, true);
+ DecodeNFrames(&outputs_new[0], 10);
+
+ FreeBitstreamBuffers(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs_new[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayers) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+
+ vpx_fixed_buf base_layer[5];
+ for (int i = 0; i < 5; ++i)
+ base_layer[i] = outputs[i * 2];
+
+ DecodeNFrames(&base_layer[0], 5);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ codec_enc_.g_error_resilient = 0;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DropLayersAndMakeItVP9Comaptible(&outputs[0], 10, 1, true);
+
+ vpx_fixed_buf base_layer[5];
+ for (int i = 0; i < 5; ++i)
+ base_layer[i] = outputs[i * 2];
+
+ DecodeNFrames(&base_layer[0], 5);
+ FreeBitstreamBuffers(&outputs[0], 10);
}
} // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index ee6289f..e6114ab 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -10,8 +10,8 @@
25751f5d3b05ff03f0719ad42cd625348eb8961e invalid-vp90-01-v2.webm.res
d78e2fceba5ac942246503ec8366f879c4775ca5 invalid-vp90-02-v2.webm
8e2eff4af87d2b561cce2365713269e301457ef3 invalid-vp90-02-v2.webm.res
-df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v2.webm
-25dd58c22d23f75304d7ce7f69f4e5b02ef9119a invalid-vp90-03-v2.webm.res
+df1a1453feb3c00d7d89746c7003b4163523bff3 invalid-vp90-03-v3.webm
+4935c62becc68c13642a03db1e6d3e2331c1c612 invalid-vp90-03-v3.webm.res
d637297561dd904eb2c97a9015deeb31c4a1e8d2 invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
3a204bdbeaa3c6458b77bcebb8366d107267f55d invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
a432f96ff0a787268e2f94a8092ab161a18d1b06 park_joy_90p_10_420.y4m
@@ -681,3 +681,23 @@
c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5
717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m
+b7c1296630cdf1a7ef493d15ff4f9eb2999202f6 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32 invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+bcdedaf168ac225575468fda77502d2dc9fd5baa invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+65e93f9653bcf65b022f7d225268d1a90a76e7bb vp90-2-19-skip.webm
+368dccdde5288c13c25695d2eacdc7402cadf613 vp90-2-19-skip.webm.md5
+ffe460282df2b0e7d4603c2158653ad96f574b02 vp90-2-19-skip-01.webm
+bd21bc9eda4a4a36b221d71ede3a139fc3c7bd85 vp90-2-19-skip-01.webm.md5
+b03c408cf23158638da18dbc3323b99a1635c68a invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8 invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+5e67e24e7f53fd189e565513cef8519b1bd6c712 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+741158f67c0d9d23726624d06bdc482ad368afc9 invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109 invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+552e372e9b78127389fb06b34545df2cec15ba6d invalid-vp91-2-mixedrefcsp-444to420.ivf
+a61774cf03fc584bd9f0904fc145253bb8ea6c4c invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+812d05a64a0d83c1b504d0519927ddc5a2cdb273 invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+1e472baaf5f6113459f0399a38a5a5e68d17799d invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
diff --git a/test/test.mk b/test/test.mk
index 0814c2b..b92b6da 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -785,6 +785,10 @@
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-19-skip-01.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5
@@ -793,16 +797,32 @@
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v2.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v2.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-03-v3.webm.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
# BBB VP9 streams
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index dbdbdd6..cccebf8 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -181,7 +181,8 @@
"vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm",
"vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm",
"vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm",
- "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm",
+ "vp90-2-18-resize.ivf", "vp90-2-19-skip.webm",
+ "vp90-2-19-skip-01.webm", "vp91-2-04-yuv444.webm",
};
const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors);
#endif // CONFIG_VP9_DECODER
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index d714452..b9f879d 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -29,7 +29,7 @@
md5_inv_order_(),
n_tiles_(GET_PARAM(1)) {
init_flags_ = VPX_CODEC_USE_PSNR;
- vpx_codec_dec_cfg_t cfg;
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.w = 704;
cfg.h = 144;
cfg.threads = 1;
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 0bfefba..15c7bce 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -144,6 +144,24 @@
fi
}
+# Echoes the path to $1 when it is executable and exists in ${LIBVPX_BIN_PATH},
+# or an empty string otherwise. The caller is responsible for testing the
+# string once the function returns.
+vpx_tool_path() {
+ local readonly tool_name="$1"
+ local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}"
+ if [ ! -x "${tool_path}" ]; then
+ # Try one directory up: when running via examples.sh the tool could be in
+ # the parent directory of $LIBVPX_BIN_PATH.
+ tool_path="${LIBVPX_BIN_PATH}/../${tool_name}${VPX_TEST_EXE_SUFFIX}"
+ fi
+
+ if [ ! -x "${tool_path}" ]; then
+ tool_path=""
+ fi
+ echo "${tool_path}"
+}
+
# Echoes yes to stdout when the file named by positional parameter one exists
# in LIBVPX_BIN_PATH and is executable.
vpx_tool_available() {
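
Usage sketch for the new helper (illustrative, not part of the patch; assumes
tools_common.sh is sourced). Because vpx_tool_path reports failure through an
empty echoed string rather than an exit status, callers capture and test its
output, as the updated verify_environment functions later in this change do:

  # Inside a hypothetical *_verify_environment function:
  decoder="$(vpx_tool_path vpxdec)"
  if [ -z "${decoder}" ]; then
    elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
    return 1
  fi
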
diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
index 22fce85..8512d88 100644
--- a/test/user_priv_test.cc
+++ b/test/user_priv_test.cc
@@ -47,7 +47,7 @@
libvpx_test::WebMVideoSource video(filename);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
libvpx_test::VP9Decoder decoder(cfg, 0);
libvpx_test::MD5 md5;
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 7d81182..f76402e 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -35,6 +35,14 @@
using ::std::tr1::tuple;
using libvpx_test::ACMRandom;
+static unsigned int mb_ss_ref(const int16_t *src) {
+ unsigned int res = 0;
+ for (int i = 0; i < 256; ++i) {
+ res += src[i] * src[i];
+ }
+ return res;
+}
+
static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, unsigned int *sse_ptr) {
int se = 0;
@@ -76,6 +84,50 @@
return sse - (((int64_t) se * se) >> (l2w + l2h));
}
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+ SumOfSquaresTest() : func_(GetParam()) {}
+
+ virtual ~SumOfSquaresTest() {
+ libvpx_test::ClearSystemState();
+ }
+
+ protected:
+ void ConstTest();
+ void RefTest();
+
+ SumOfSquaresFunction func_;
+ ACMRandom rnd_;
+};
+
+void SumOfSquaresTest::ConstTest() {
+ int16_t mem[256];
+ unsigned int res;
+ for (int v = 0; v < 256; ++v) {
+ for (int i = 0; i < 256; ++i) {
+ mem[i] = v;
+ }
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(256u * (v * v), res);
+ }
+}
+
+void SumOfSquaresTest::RefTest() {
+ int16_t mem[256];
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 256; ++j) {
+ mem[j] = rnd_.Rand8() - rnd_.Rand8();
+ }
+
+ const unsigned int expected = mb_ss_ref(mem);
+ unsigned int res;
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(expected, res);
+ }
+}
+
template<typename VarianceFunctionType>
class VarianceTest
: public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > {
@@ -88,7 +140,7 @@
height_ = 1 << log2height_;
variance_ = get<2>(params);
- rnd(ACMRandom::DeterministicSeed());
+ rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
ref_ = new uint8_t[block_size_];
@@ -107,7 +159,7 @@
void RefTest();
void OneQuarterTest();
- ACMRandom rnd;
+ ACMRandom rnd_;
uint8_t* src_;
uint8_t* ref_;
int width_, log2width_;
@@ -135,8 +187,8 @@
void VarianceTest<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- ref_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -206,7 +258,7 @@
height_ = 1 << log2height_;
subpel_variance_ = get<2>(params);
- rnd(ACMRandom::DeterministicSeed());
+ rnd_.Reset(ACMRandom::DeterministicSeed());
block_size_ = width_ * height_;
src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
@@ -226,7 +278,7 @@
protected:
void RefTest();
- ACMRandom rnd;
+ ACMRandom rnd_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
@@ -241,10 +293,10 @@
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -263,11 +315,11 @@
for (int x = 0; x < 16; ++x) {
for (int y = 0; y < 16; ++y) {
for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- sec_[j] = rnd.Rand8();
+ src_[j] = rnd_.Rand8();
+ sec_[j] = rnd_.Rand8();
}
for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- ref_[j] = rnd.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
unsigned int var1;
@@ -362,6 +414,13 @@
namespace vp9 {
#if CONFIG_VP9_ENCODER
+
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+ ::testing::Values(vp9_get_mb_ss_c));
+
typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
@@ -485,23 +544,12 @@
make_tuple(6, 5, subpel_avg_variance64x32_c),
make_tuple(6, 6, subpel_avg_variance64x64_c)));
-#if HAVE_MMX
-const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx;
-const vp9_variance_fn_t variance8x8_mmx = vp9_variance8x8_mmx;
-const vp9_variance_fn_t variance8x16_mmx = vp9_variance8x16_mmx;
-const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx;
-const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx;
-INSTANTIATE_TEST_CASE_P(
- MMX, VP9VarianceTest,
- ::testing::Values(make_tuple(2, 2, variance4x4_mmx),
- make_tuple(3, 3, variance8x8_mmx),
- make_tuple(3, 4, variance8x16_mmx),
- make_tuple(4, 3, variance16x8_mmx),
- make_tuple(4, 4, variance16x16_mmx)));
-#endif
-
#if HAVE_SSE2
#if CONFIG_USE_X86INC
+
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+ ::testing::Values(vp9_get_mb_ss_sse2));
+
const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
diff --git a/test/video_source.h b/test/video_source.h
index c924f96..84bfa8e 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -92,12 +92,7 @@
protected:
void CloseFile() {
if (file_) {
- // Close if file pointer is associated with an open file
-#if defined(_WIN32)
- if (file_->_ptr != NULL) fclose(file_);
-#else
- if (fileno(file_) != -1) fclose(file_);
-#endif
+ fclose(file_);
file_ = NULL;
}
}
diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
index 470fdf1..972a1d9 100644
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -47,7 +47,7 @@
libvpx_test::IVFVideoSource video("vp80-00-comprehensive-001.ivf");
video.Init();
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP8Decoder decoder(dec_cfg, 0);
video.Begin();
diff --git a/test/vp8_multi_resolution_encoder.sh b/test/vp8_multi_resolution_encoder.sh
new file mode 100755
index 0000000..a8b7fe7
--- /dev/null
+++ b/test/vp8_multi_resolution_encoder.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+##
+## Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## This file tests the libvpx vp8_multi_resolution_encoder example. To add new
+## tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to vp8_mre_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+vp8_multi_resolution_encoder_verify_environment() {
+ if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+ return 1
+ fi
+ local readonly app="vp8_multi_resolution_encoder"
+ if [ -z "$(vpx_tool_path "${app}")" ]; then
+ elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
+ return 1
+ fi
+ fi
+}
+
+# Runs vp8_multi_resolution_encoder. Forwards all arguments to
+# vp8_multi_resolution_encoder after building the path to the executable.
+vp8_mre() {
+ local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)"
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "$@" ${devnull}
+}
+
+vp8_multi_resolution_encoder_three_formats() {
+ local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
+ ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
+ ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+
+ if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
+ if [ "$(vp8_encode_available)" = "yes" ]; then
+ # Param order:
+ # Input width
+ # Input height
+ # Input file path
+ # Output file names
+ # Output PSNR
+ vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" \
+ "${YUV_RAW_INPUT}" \
+ ${output_files} \
+ 0
+
+ for output_file in ${output_files}; do
+ if [ ! -e "${output_file}" ]; then
+ elog "Missing output file: ${output_file}"
+ return 1
+ fi
+ done
+ fi
+ fi
+}
+
+vp8_mre_tests="vp8_multi_resolution_encoder_three_formats"
+run_tests vp8_multi_resolution_encoder_verify_environment "${vp8_mre_tests}"
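
As the header comments in this new script describe, extending it is a two-step
edit: write a shell function and add its name to vp8_mre_tests on a new line.
A hypothetical sketch (the function name and output paths are illustrative
only, not part of the patch):

  # Hypothetical extra test: encode three streams, then verify one output.
  vp8_multi_resolution_encoder_smoke() {
    if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ] && \
       [ "$(vp8_encode_available)" = "yes" ]; then
      vp8_mre "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
        "${YUV_RAW_INPUT}" \
        "${VPX_TEST_OUTPUT_DIR}/mre_smoke_0.ivf" \
        "${VPX_TEST_OUTPUT_DIR}/mre_smoke_1.ivf" \
        "${VPX_TEST_OUTPUT_DIR}/mre_smoke_2.ivf" \
        0
      [ -e "${VPX_TEST_OUTPUT_DIR}/mre_smoke_0.ivf" ] || return 1
    fi
  }

  vp8_mre_tests="${vp8_mre_tests}
                 vp8_multi_resolution_encoder_smoke"
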
diff --git a/test/vp9_decrypt_test.cc b/test/vp9_decrypt_test.cc
index 88a3c14..d988612 100644
--- a/test/vp9_decrypt_test.cc
+++ b/test/vp9_decrypt_test.cc
@@ -47,7 +47,7 @@
libvpx_test::IVFVideoSource video("vp90-2-05-resize.ivf");
video.Init();
- vpx_codec_dec_cfg_t dec_cfg = {0};
+ vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9Decoder decoder(dec_cfg, 0);
video.Begin();
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index d7fc4ee..cc35476 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -163,7 +163,7 @@
libvpx_test::WebMVideoSource video(filename);
video.Init();
- vpx_codec_dec_cfg_t cfg = {0};
+ vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
cfg.threads = num_threads;
libvpx_test::VP9Decoder decoder(cfg, 0);
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index 836b13c..f92acbd 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -17,14 +17,13 @@
# Environment check: Make sure input is available.
vpxdec_verify_environment() {
if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ]; then
- echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
+ elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
-}
-
-# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
-vpxdec_available() {
- [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+ if [ -z "$(vpx_tool_path vpxdec)" ]; then
+ elog "vpxdec not found. It must exist in LIBVPX_BIN_PATH or its parent."
+ return 1
+ fi
}
# Wrapper function for running vpxdec with pipe input. Requires that
@@ -32,8 +31,8 @@
# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the
# input file path and shifted away. All remaining parameters are passed through
# to vpxdec.
vpxdec_pipe() {
- local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
- local input="$1"
+ local readonly decoder="$(vpx_tool_path vpxdec)"
+ local readonly input="$1"
shift
cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
}
@@ -42,22 +41,20 @@
# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to
# the directory containing vpxdec. $1 is used as the input file path and
# shifted away. All remaining parameters are passed through to vpxdec.
vpxdec() {
- local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
- local input="${1}"
+ local readonly decoder="$(vpx_tool_path vpxdec)"
+ local readonly input="$1"
shift
eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
}
vpxdec_can_decode_vp8() {
- if [ "$(vpxdec_available)" = "yes" ] && \
- [ "$(vp8_decode_available)" = "yes" ]; then
+ if [ "$(vp8_decode_available)" = "yes" ]; then
echo yes
fi
}
vpxdec_can_decode_vp9() {
- if [ "$(vpxdec_available)" = "yes" ] && \
- [ "$(vp9_decode_available)" = "yes" ]; then
+ if [ "$(vp9_decode_available)" = "yes" ]; then
echo yes
fi
}
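
For reference, the two wrappers differ only in how input reaches the binary:
vpxdec_pipe cats the file to stdin (the wrapper passes '-' so vpxdec reads the
pipe), while vpxdec passes the file path directly. A usage sketch mirroring
the existing tests (the flags are pass-through arguments):

  # Direct-path invocation; remaining arguments go straight to vpxdec.
  vpxdec "${VP8_IVF_FILE}" --summary --noblit

  # Same decode fed through a pipe.
  vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit
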
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index 6e9ad35..9674bdc 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -20,28 +20,59 @@
# Environment check: Make sure input is available.
vpxenc_verify_environment() {
if [ ! -e "${YUV_RAW_INPUT}" ]; then
- echo "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+ elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBVPX_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ -z "$(vpx_tool_path vpxenc)" ]; then
+ elog "vpxenc not found. It must exist in LIBVPX_BIN_PATH or its parent."
return 1
fi
}
vpxenc_can_encode_vp8() {
- if [ "$(vpxenc_available)" = "yes" ] && \
- [ "$(vp8_encode_available)" = "yes" ]; then
+ if [ "$(vp8_encode_available)" = "yes" ]; then
echo yes
fi
}
vpxenc_can_encode_vp9() {
- if [ "$(vpxenc_available)" = "yes" ] && \
- [ "$(vp9_encode_available)" = "yes" ]; then
+ if [ "$(vp9_encode_available)" = "yes" ]; then
echo yes
fi
}
-# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
-vpxenc_available() {
- [ -n "$(vpx_tool_available vpxenc)" ] && echo yes
+# Echo vpxenc command line parameters allowing use of
+# hantro_collage_w352h288.yuv as input.
+yuv_input_hantro_collage() {
+ echo ""${YUV_RAW_INPUT}"
+ --width="${YUV_RAW_INPUT_WIDTH}"
+ --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Echo default vpxenc real-time encoding params. $1 is the codec, which defaults
+# to vp8 if unspecified.
+vpxenc_rt_params() {
+ local readonly codec="${1:-vp8}"
+ echo "--codec=${codec}
+ --buf-initial-sz=500
+ --buf-optimal-sz=600
+ --buf-sz=1000
+ --cpu-used=-5
+ --end-usage=cbr
+ --error-resilient=1
+ --kf-max-dist=90000
+ --lag-in-frames=0
+ --max-intra-rate=300
+ --max-q=56
+ --min-q=2
+ --noise-sensitivity=0
+ --overshoot-pct=50
+ --passes=1
+ --profile=0
+ --resize-allowed=0
+ --rt
+ --static-thresh=0
+ --undershoot-pct=50"
}
# Wrapper function for running vpxenc with pipe input. Requires that
@@ -49,51 +80,34 @@
# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
# input file path and shifted away. All remaining parameters are passed through
# to vpxenc.
vpxenc_pipe() {
- local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+ local readonly encoder="$(vpx_tool_path vpxenc)"
local readonly input="$1"
shift
- cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - "$@" ${devnull}
+ cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \
+ --test-decode=fatal \
+ "$@" ${devnull}
}
# Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
# the directory containing vpxenc. $1 is used as the input file path and
# shifted away. All remaining parameters are passed through to vpxenc.
vpxenc() {
- local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
- local readonly input="${1}"
+ local readonly encoder="$(vpx_tool_path vpxenc)"
+ local readonly input="$1"
shift
- eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" "$@" ${devnull}
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \
+ --test-decode=fatal \
+ "$@" ${devnull}
}
vpxenc_vp8_ivf() {
if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${TEST_FRAMES}" \
--ivf \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
-
- if [ ! -e "${output}" ]; then
- elog "Output file does not exist."
- return 1
- fi
- fi
-}
-
-vpxenc_vp8_ivf_piped_input() {
- if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
- local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
- cat "${YUV_RAW_INPUT}" \
- | vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
- --limit="${TEST_FRAMES}" \
- --ivf \
- --output="${output}" \
- -
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -106,12 +120,78 @@
if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
- vpxenc --codec=vp8 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
+ --limit="${TEST_FRAMES}" \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp8_webm_rt() {
+ if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ $(vpxenc_rt_params vp8) \
+ --output="${output}"
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp8_webm_2pass() {
+ if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
--limit="${TEST_FRAMES}" \
--output="${output}" \
- "${YUV_RAW_INPUT}"
+ --passes=2
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp8_webm_lag10_frames20() {
+ if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly lag_total_frames=20
+ local readonly lag_frames=10
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp8 \
+ --limit="${lag_total_frames}" \
+ --lag-in-frames="${lag_frames}" \
+ --output="${output}" \
+ --auto-alt-ref=1 \
+ --passes=2
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp8_ivf_piped_input() {
+ if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+ vpxenc_pipe $(yuv_input_hantro_collage) \
+ --codec=vp8 \
+ --limit="${TEST_FRAMES}" \
+ --ivf \
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -123,13 +203,11 @@
vpxenc_vp9_ivf() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
- --output="${output}" \
- "${YUV_RAW_INPUT}"
+ --output="${output}"
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -142,12 +220,42 @@
if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
+ --limit="${TEST_FRAMES}" \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp9_webm_rt() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ $(vpxenc_rt_params vp9) \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp9_webm_2pass() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--output="${output}" \
- "${YUV_RAW_INPUT}"
+ --passes=2
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -159,15 +267,12 @@
vpxenc_vp9_ivf_lossless() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
- --lossless=1 \
- --test-decode=fatal \
- "${YUV_RAW_INPUT}"
+ --lossless=1
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -179,16 +284,34 @@
vpxenc_vp9_ivf_minq0_maxq0() {
if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf"
- vpxenc --codec=vp9 \
- --width="${YUV_RAW_INPUT_WIDTH}" \
- --height="${YUV_RAW_INPUT_HEIGHT}" \
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
--limit="${TEST_FRAMES}" \
--ivf \
--output="${output}" \
--min-q=0 \
- --max-q=0 \
- --test-decode=fatal \
- "${YUV_RAW_INPUT}"
+ --max-q=0
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+vpxenc_vp9_webm_lag10_frames20() {
+ if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly lag_total_frames=20
+ local readonly lag_frames=10
+ local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm"
+ vpxenc $(yuv_input_hantro_collage) \
+ --codec=vp9 \
+ --limit="${lag_total_frames}" \
+ --lag-in-frames="${lag_frames}" \
+ --output="${output}" \
+ --passes=2 \
+ --auto-alt-ref=1
if [ ! -e "${output}" ]; then
elog "Output file does not exist."
@@ -199,10 +322,16 @@
vpxenc_tests="vpxenc_vp8_ivf
vpxenc_vp8_webm
+ vpxenc_vp8_webm_rt
+ vpxenc_vp8_webm_2pass
+ vpxenc_vp8_webm_lag10_frames20
vpxenc_vp8_ivf_piped_input
vpxenc_vp9_ivf
vpxenc_vp9_webm
+ vpxenc_vp9_webm_rt
+ vpxenc_vp9_webm_2pass
vpxenc_vp9_ivf_lossless
- vpxenc_vp9_ivf_minq0_maxq0"
+ vpxenc_vp9_ivf_minq0_maxq0
+ vpxenc_vp9_webm_lag10_frames20"
run_tests vpxenc_verify_environment "${vpxenc_tests}"
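
A subtlety in the refactored call sites above: yuv_input_hantro_collage and
vpxenc_rt_params echo newline-separated flags, and the callers expand them
unquoted so the shell word-splits the output into individual arguments. A
minimal sketch of the pattern (the output path is illustrative):

  # Unquoted $(...) expansion splits the echoed flag list into separate words;
  # quoting the substitution would pass everything as one argument.
  vpxenc $(yuv_input_hantro_collage) \
    $(vpxenc_rt_params vp9) \
    --output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_sketch.webm"
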
diff --git a/test/y4m_test.cc b/test/y4m_test.cc
index d4a2ede..58a6fe3 100644
--- a/test/y4m_test.cc
+++ b/test/y4m_test.cc
@@ -57,7 +57,7 @@
for (plane = 0; plane < 3; ++plane) {
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytes_per_sample = (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
const int h = (plane ? (img->d_h + img->y_chroma_shift) >>
img->y_chroma_shift : img->d_h);
const int w = (plane ? (img->d_w + img->x_chroma_shift) >>
@@ -141,11 +141,11 @@
Y4mVideoWriteTest() {}
virtual ~Y4mVideoWriteTest() {
- CloseSource();
delete tmpfile_;
+ input_file_ = NULL;
}
- virtual void ReplaceInputFile(FILE *input_file) {
+ void ReplaceInputFile(FILE *input_file) {
CloseSource();
frame_ = 0;
input_file_ = input_file;
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index fa5b498..3869d25 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1041
+Version: 1060
License: BSD
License File: LICENSE
@@ -13,4 +13,4 @@
in order to encode multiple resolution bit streams.
Local Modifications:
-None.
+cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index 82fd95d..8423121 100644
--- a/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -153,7 +153,6 @@
int* subsample_x, int* subsample_y, int number_of_components);
private:
-
void AllocOutputBuffers(int num_outbufs);
void DestroyOutputBuffers();
diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h
index fdfe1ae..4b3c870 100644
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -252,6 +252,94 @@
// The following are available on arm64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+// #define HAS_I444TOARGBROW_NEON
+// #define HAS_I422TOARGBROW_NEON
+// #define HAS_I411TOARGBROW_NEON
+// #define HAS_I422TOBGRAROW_NEON
+// #define HAS_I422TOABGRROW_NEON
+// #define HAS_I422TORGBAROW_NEON
+// #define HAS_I422TORGB24ROW_NEON
+// #define HAS_I422TORAWROW_NEON
+// #define HAS_I422TORGB565ROW_NEON
+// #define HAS_I422TOARGB1555ROW_NEON
+// #define HAS_I422TOARGB4444ROW_NEON
+// #define HAS_YTOARGBROW_NEON
+// #define HAS_I400TOARGBROW_NEON
+// #define HAS_NV12TOARGBROW_NEON
+// #define HAS_NV21TOARGBROW_NEON
+// #define HAS_NV12TORGB565ROW_NEON
+// #define HAS_NV21TORGB565ROW_NEON
+// #define HAS_YUY2TOARGBROW_NEON
+// #define HAS_UYVYTOARGBROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROWS_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+// #define HAS_RGB565TOARGBROW_NEON
+// #define HAS_ARGB1555TOARGBROW_NEON
+// #define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+// #define HAS_ARGBTORGB565ROW_NEON
+// #define HAS_ARGBTOARGB1555ROW_NEON
+// #define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+// #define HAS_ARGBTOUV444ROW_NEON
+// #define HAS_ARGBTOUV422ROW_NEON
+// #define HAS_ARGBTOUV411ROW_NEON
+// #define HAS_ARGBTOUVROW_NEON
+// #define HAS_ARGBTOUVJROW_NEON
+// #define HAS_BGRATOUVROW_NEON
+// #define HAS_ABGRTOUVROW_NEON
+// #define HAS_RGBATOUVROW_NEON
+// #define HAS_RGB24TOUVROW_NEON
+// #define HAS_RAWTOUVROW_NEON
+// #define HAS_RGB565TOUVROW_NEON
+// #define HAS_ARGB1555TOUVROW_NEON
+// #define HAS_ARGB4444TOUVROW_NEON
+// #define HAS_RGB565TOYROW_NEON
+// #define HAS_ARGB1555TOYROW_NEON
+// #define HAS_ARGB4444TOYROW_NEON
+// #define HAS_BGRATOYROW_NEON
+// #define HAS_ABGRTOYROW_NEON
+// #define HAS_RGBATOYROW_NEON
+// #define HAS_RGB24TOYROW_NEON
+// #define HAS_RAWTOYROW_NEON
+// #define HAS_INTERPOLATEROW_NEON
+// #define HAS_ARGBBLENDROW_NEON
+// #define HAS_ARGBATTENUATEROW_NEON
+// #define HAS_ARGBQUANTIZEROW_NEON
+// #define HAS_ARGBSHADEROW_NEON
+// #define HAS_ARGBGRAYROW_NEON
+// #define HAS_ARGBSEPIAROW_NEON
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
#endif
// The following are available on Neon platforms:
@@ -465,7 +553,7 @@
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
#endif // defined(__native_client__) && defined(__x86_64__)
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h
index 8dc0762..3c49542 100644
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -51,6 +51,14 @@
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
+#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__aarch64__) || defined(LIBYUV_NEON))
+/* #define HAS_SCALEROWDOWN2_NEON */
+/* #define HAS_SCALEROWDOWN4_NEON */
+/* #define HAS_SCALEROWDOWN34_NEON */
+/* #define HAS_SCALEROWDOWN38_NEON */
+/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
+/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif
// The following are available on Mips platforms:
diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h
index 912c4c9..73a7f1b 100644
--- a/third_party/libyuv/include/libyuv/version.h
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1041
+#define LIBYUV_VERSION 1059
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc
index 9ea81b4..dc715e0 100644
--- a/third_party/libyuv/source/compare.cc
+++ b/third_party/libyuv/source/compare.cc
@@ -80,7 +80,7 @@
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc
index 5e7b8e4..55052c0 100644
--- a/third_party/libyuv/source/compare_neon.cc
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -56,6 +56,45 @@
return sse;
}
+#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %2, %2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "bgt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
#endif // __ARM_NEON__
#ifdef __cplusplus
diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc
index 874a6cb..a8e294f 100644
--- a/third_party/libyuv/source/convert.cc
+++ b/third_party/libyuv/source/convert.cc
@@ -401,7 +401,7 @@
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
- int halfheight = (height + 1) >> 1;
+ int halfheight;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) = YUY2ToUV422Row_C;
@@ -711,11 +711,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -963,9 +965,6 @@
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1022,36 +1021,44 @@
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
- RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb24 += src_stride_rgb24 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
- RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
- RGB24ToYRow(src_rgb24, dst_y, width);
-#else
- RGB24ToARGBRow(src_rgb24, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
+ {
#if !defined(HAS_RGB24TOYROW_NEON)
- free_aligned_buffer_64(row);
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ }
return 0;
}
@@ -1075,9 +1082,6 @@
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_raw || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1134,36 +1138,42 @@
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
- RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_raw += src_stride_raw * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ for (y = 0; y < height - 1; y += 2) {
+ #if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+ #else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ #endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ #if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ #else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ #endif
+ }
+ #if !defined(HAS_RAWTOYROW_NEON)
+ free_aligned_buffer_64(row);
+ #endif
}
- if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
- RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
- RAWToYRow(src_raw, dst_y, width);
-#else
- RAWToARGBRow(src_raw, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
-#if !defined(HAS_RAWTOYROW_NEON)
- free_aligned_buffer_64(row);
-#endif
return 0;
}
@@ -1187,9 +1197,6 @@
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1246,36 +1253,44 @@
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB565TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
- RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_rgb565 += src_stride_rgb565 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
- RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
- RGB565ToYRow(src_rgb565, dst_y, width);
-#else
- RGB565ToARGBRow(src_rgb565, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
+ {
#if !defined(HAS_RGB565TOYROW_NEON)
- free_aligned_buffer_64(row);
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB565TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ }
return 0;
}
@@ -1299,9 +1314,6 @@
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1358,38 +1370,45 @@
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB1555TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
- ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
- width);
-#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ {
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
#endif
- src_argb1555 += src_stride_argb1555 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
+ for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON)
- ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
- ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
#else
- ARGB1555ToARGBRow(src_argb1555, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
- }
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
#if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
+ }
return 0;
}
@@ -1413,9 +1432,6 @@
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
#endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -1472,38 +1488,46 @@
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB4444TOYROW_NEON
- for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
- ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
- width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
- width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
-#endif
- src_argb4444 += src_stride_argb4444 * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
- ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
- ARGB4444ToYRow(src_argb4444, dst_y, width);
-#else
- ARGB4444ToARGBRow(src_argb4444, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
-#endif
- }
+ {
#if !defined(HAS_ARGB4444TOYROW_NEON)
- free_aligned_buffer_64(row);
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ }
return 0;
}
diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc
index 121a416..de461dd 100644
--- a/third_party/libyuv/source/convert_from_argb.cc
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -60,6 +60,13 @@
}
}
}
+#elif defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -76,10 +83,8 @@
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
- ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
- ARGBToUV444Row = ARGBToUV444Row_NEON;
}
}
#endif
@@ -134,6 +139,13 @@
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
@@ -153,12 +165,6 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -228,11 +234,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 32) {
- ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
- if (IS_ALIGNED(width, 32)) {
- ARGBToUV411Row = ARGBToUV411Row_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
}
}
#endif
@@ -261,9 +269,6 @@
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
@@ -296,11 +301,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -331,22 +338,27 @@
}
}
#endif
+ {
+ // Allocate 2 rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
}
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- }
- free_aligned_buffer_64(row_u);
return 0;
}
@@ -364,9 +376,6 @@
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
- // Allocate a rows of uv.
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
- uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
@@ -399,11 +408,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
@@ -434,22 +445,27 @@
}
}
#endif
+ {
+ // Allocate 2 rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
- for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
- dst_y += dst_stride_y * 2;
- dst_uv += dst_stride_uv;
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
}
- if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- }
- free_aligned_buffer_64(row_u);
return 0;
}
@@ -493,6 +509,13 @@
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -510,12 +533,6 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -594,6 +611,13 @@
}
}
}
+#elif defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@@ -611,12 +635,6 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_NEON;
- }
- }
}
#endif
@@ -1022,11 +1040,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
- if (width >= 16) {
- ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
#endif
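
The recurring change in convert_from_argb.cc is structural: UV-row selection used to be nested inside the Y-row NEON block, and each kernel is now picked under its own HAS_ guard, so a port that implements only some NEON row functions (as the early aarch64 port does) still gets the ones it has. A compilable sketch of the resulting pattern; the RowFunc* names and stubs are hypothetical, while TestCpuFlag/IS_ALIGNED mirror the real libyuv helpers:

    #include <stdint.h>
    typedef uint8_t uint8;

    /* Hypothetical kernels standing in for e.g. ARGBToUVRow_*: */
    static void RowFunc_C(const uint8* s, uint8* d, int w) {}
    static void RowFunc_Any_NEON(const uint8* s, uint8* d, int w) {}
    static void RowFunc_NEON(const uint8* s, uint8* d, int w) {}
    static int TestCpuFlag(int flag) { return 1; }  /* stub detector */
    enum { kCpuHasNEON = 1 };
    #define IS_ALIGNED(n, a) (((n) & ((a) - 1)) == 0)
    #define HAS_ROWFUNC_NEON  /* pretend this port ships the kernel */

    static void SelectRowFunc(int width) {
      void (*RowFunc)(const uint8*, uint8*, int) = RowFunc_C;
    #if defined(HAS_ROWFUNC_NEON)
      if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
        RowFunc = RowFunc_Any_NEON;       /* handles any width >= 16 */
        if (IS_ALIGNED(width, 16)) {
          RowFunc = RowFunc_NEON;         /* exact multiple: fast path */
        }
      }
    #endif
      RowFunc(0, 0, width);
    }
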
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 2e0d61d..8f8a403 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -14,8 +14,9 @@
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
- !defined(__native_client__) && defined(_M_X64) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+ !defined(__native_client__) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+ (defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv()
#endif
@@ -97,7 +98,7 @@
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
-#elif defined(_M_IX86)
+#elif defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
@@ -256,12 +257,17 @@
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info_ = kCpuHasNEON;
+// For aarch64 (arm64), the feature list in /proc/cpuinfo is incomplete,
+// e.g. it carries no neon flag.
+// So for aarch64, NEON support is hard-coded here.
+#elif defined(__aarch64__)
+ cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
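
The armv7 path parses /proc/cpuinfo, but aarch64 kernels describe NEON as "asimd" rather than "neon", which is why the flag is forced on above. A scalar sketch of that decision (HasNeon is a hypothetical helper, not libyuv API):

    #include <string.h>

    static int HasNeon(const char* features, int is_aarch64) {
      if (is_aarch64) return 1;                /* hard-coded, as above */
      return strstr(features, " neon") != 0;   /* armv7: parse cpuinfo */
    }
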
diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc
index a3daf96..3c17371 100644
--- a/third_party/libyuv/source/format_conversion.cc
+++ b/third_party/libyuv/source/format_conversion.cc
@@ -332,11 +332,13 @@
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
- if (width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
index 15b0ed8..36028c3 100644
--- a/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -13,8 +13,8 @@
#ifdef HAVE_JPEG
#include <assert.h>
-#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
- !defined(TARGET_IPHONE_SIMULATOR)
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
@@ -101,7 +101,7 @@
}
buf_.data = src;
- buf_.len = (int)(src_len);
+ buf_.len = static_cast<int>(src_len);
buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP
@@ -411,7 +411,7 @@
}
boolean fill_input_buffer(j_decompress_ptr cinfo) {
- BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+ BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data");
// ERROR: No more data
@@ -447,7 +447,7 @@
// ERROR: Error in jpeglib: buf
#endif
- SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1);
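
For context, the setjmp/longjmp pair here is the standard libjpeg fatal-error trampoline: the decode body runs under setjmp(), and the error manager longjmp()s back instead of letting jpeglib call exit(). A minimal self-contained model of that control flow (no jpeglib required):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf g_err;

    static void fatal_error(const char* msg) {
      fprintf(stderr, "decode error: %s\n", msg);
      longjmp(g_err, 1);           /* unwind back to setjmp() in decode() */
    }

    static int decode(int fail) {
      if (setjmp(g_err)) {
        return 0;                  /* reached via longjmp: failure */
      }
      if (fail) fatal_error("corrupt stream");
      return 1;                    /* normal completion */
    }

    int main(void) {
      printf("ok=%d fail=%d\n", decode(0), decode(1));
      return 0;
    }
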
diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
index 97ef844..ce8b3da 100644
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -79,9 +79,13 @@
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+#endif // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOUYVYROW_NEON
#undef YANY
// Wrappers to handle odd width
@@ -250,12 +254,26 @@
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif
#undef YANY
@@ -333,7 +351,11 @@
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
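
These new guards wrap YANY/UVANY instantiations, whose whole job is width handling: run the SIMD kernel on the largest multiple of its step, then finish the tail in C. A scalar sketch of what such a wrapper expands to (hypothetical names, with memcpy standing in for real kernels):

    #include <string.h>

    /* Hypothetical 16-pixel SIMD kernel and its scalar fallback. */
    static void Row_SIMD(const unsigned char* s, unsigned char* d, int w) {
      memcpy(d, s, (size_t)w);  /* stands in for a 16-at-a-time kernel */
    }
    static void Row_C(const unsigned char* s, unsigned char* d, int w) {
      memcpy(d, s, (size_t)w);  /* scalar tail loop */
    }

    static void Row_Any(const unsigned char* src, unsigned char* dst,
                        int width) {
      int n = width & ~15;      /* largest multiple of the SIMD step */
      if (n > 0) {
        Row_SIMD(src, dst, n);
      }
      Row_C(src + n, dst + n, width - n);  /* 0..15 leftover pixels in C */
    }
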
diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc
index 46e9ceb..21111cf 100644
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -824,19 +824,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store U
+ "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
- "vst1.8 {q1}, [%2]! \n" // store V
+ "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_SPLITUVROW_NEON
@@ -849,12 +849,12 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load U
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load V
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
- "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n"
:
"+r"(src_u), // %0
@@ -862,7 +862,7 @@
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_MERGEUVROW_NEON
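
MergeUVRow interleaves planar U and V into the semi-planar UV layout used by NV12/NV21; the st2 above does the interleave in a single store. A scalar paraphrase (not the shipped MergeUVRow_C, but equivalent in effect):

    #include <stdint.h>

    static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                                  uint8_t* dst_uv, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_u[x];  /* U lands in the even bytes */
        dst_uv[2 * x + 1] = src_v[x];  /* V lands in the odd bytes */
      }
    }
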
@@ -874,16 +874,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
: // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_COPYROW_NEON
@@ -892,16 +892,16 @@
#ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
+ "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
- : "cc", "memory", "q0"
+ : "cc", "memory", "v0"
);
}
#endif // HAS_SETROW_NEON
@@ -922,26 +922,25 @@
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_MIRRORROW_NEON
@@ -951,27 +950,27 @@
int width) {
asm volatile (
// Start at end of source row.
- "mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n"
+ "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0"
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_MIRRORUVROW_NEON
@@ -980,26 +979,25 @@
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
- "mov r3, #-16 \n"
"add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
+ "sub %0, %0, #16 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
+ "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
);
}
#endif // HAS_ARGBMIRRORROW_NEON
@@ -1007,20 +1005,20 @@
#ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1026,22 @@
#ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
+ "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v3.8b, v1.8b \n" // move g
+ "mov v4.8b, v0.8b \n" // move r
MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1169,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1189,18 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
+ "mov v4.8b, v2.8b \n" // mov g
+ "mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1211,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1231,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_UYVYTOYROW_NEON
@@ -1252,19 +1252,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1276,19 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1297,20 @@
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_yuy2
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1
@@ -1318,7 +1318,7 @@
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_YUY2TOUVROW_NEON
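
urhadd above is an unsigned rounding halving add; averaging the U and V samples of two YUY2 rows this way is what collapses 4:2:2 chroma to one 4:2:0 row. Scalar equivalent per lane:

    #include <stdint.h>

    /* urhadd v1.8b, v1.8b, v5.8b per lane: */
    static uint8_t RoundAvg(uint8_t a, uint8_t b) {
      return (uint8_t)(((int)a + (int)b + 1) >> 1);
    }
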
@@ -1327,20 +1327,20 @@
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "add %1, %0, %1 \n" // stride + src_uyvy
+ "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1
@@ -1348,7 +1348,7 @@
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1358,23 @@
uint8* dst_uv, int pix) {
asm volatile (
// change the stride to row 2 pointer
- "add %1, %0 \n"
+ "add %x1, %x0, %w1, sxtw \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
+ "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
+ "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
#endif // HAS_HALFROW_NEON
@@ -1384,22 +1384,22 @@
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
+ "mov v2.s[0], %w3 \n" // selector
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
+ "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
+ "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
+ "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
+ "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1411,16 @@
asm volatile (
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1431,20 @@
const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // shuffler
+ "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 4.
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
#endif // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1458,15 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v2.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1474,7 +1474,7 @@
"+r"(dst_yuy2), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1488,15 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
+ "mov v3.8b, v2.8b \n"
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@@ -1503,7 +1504,7 @@
"+r"(dst_uyvy), // %3
"+r"(width) // %4
:
- : "cc", "memory", "d0", "d1", "d2", "d3"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_I422TOUYVYROW_NEON
@@ -1577,28 +1578,28 @@
#ifdef HAS_ARGBTOYROW_NEON
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBTOYROW_NEON
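
The coefficient comments are the fixed-point weights 13/128, 65/128 and 33/128, sqrshrun #7 is a rounding narrowing shift, and uqadd applies the +16 studio-swing offset. Per pixel the kernel computes, in scalar terms (a sketch assuming exactly those constants):

    #include <stdint.h>

    static uint8_t ARGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
      /* 13*B + 65*G + 33*R, rounding shift right by 7, then +16. */
      return (uint8_t)(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16);
    }
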
@@ -1606,26 +1607,26 @@
#ifdef HAS_ARGBTOYJROW_NEON
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBTOYJROW_NEON
@@ -3048,20 +3049,20 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3069,7 +3070,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBMULTIPLYROW_NEON
@@ -3083,14 +3084,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3098,7 +3101,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBADDROW_NEON
@@ -3112,14 +3115,16 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
@@ -3127,7 +3132,7 @@
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSUBTRACTROW_NEON
@@ -3141,27 +3146,27 @@
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "mov v1.8b, v0.8b \n"
+ "mov v2.8b, v0.8b \n"
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELROW_NEON
@@ -3175,20 +3180,20 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1"
);
}
#endif // HAS_SOBELTOPLANEROW_NEON
@@ -3202,25 +3207,25 @@
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
+ "movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1"
+ : "cc", "memory", "v0", "v1", "v2", "v3"
);
}
#endif // HAS_SOBELXYROW_NEON
@@ -3236,28 +3241,28 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%5 \n" // top
+ "ld1 {v0.8b}, [%0],%5 \n" // top
MEMACCESS(0)
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(1)
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(2)
- "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2)
- "vld1.8 {d3}, [%2],%6 \n"
+ "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3)
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3266,7 +3271,7 @@
"+r"(width) // %4
: "r"(2), // %5
"r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELXROW_NEON
@@ -3282,28 +3287,28 @@
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0],%4 \n" // left
+ "ld1 {v0.8b}, [%0],%4 \n" // left
MEMACCESS(1)
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0],%5 \n" // right
+ "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1)
- "vld1.8 {d3}, [%1],%5 \n"
+ "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -3311,7 +3316,7 @@
"+r"(width) // %3
: "r"(1), // %4
"r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_SOBELYROW_NEON
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 8eb8889..d79c353 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -10,7 +10,7 @@
#include "libyuv/row.h"
-#if defined (_M_X64)
+#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16
#endif
@@ -21,7 +21,8 @@
#endif
// This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+ (defined(_M_IX86) || defined(_M_X64))
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
@@ -78,7 +79,6 @@
const uint8* v_buf,
uint8* dst_argb,
int width) {
-
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
@@ -132,7 +132,6 @@
const uint8* v_buf,
uint8* dst_argb,
int width) {
-
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128();
diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 0000000..64c7d10
--- /dev/null
+++ b/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,790 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x1 pixels, throw away the even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
+
+#ifdef HAS_SCALEROWDOWN2_NEON
+// Read 32x2, average down, and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif //HAS_SCALEROWDOWN2_NEON
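
The vpaddl/vpadal/vrshrn sequence in ScaleRowDown2Box_NEON is a 2x2 box filter with rounding; per output pixel it reduces to:

    #include <stdint.h>

    static uint8_t Box2x2(const uint8_t* row0, const uint8_t* row1, int x) {
      /* Sum the four neighbors, +2 to round, >> 2 (matches vrshrn #2). */
      return (uint8_t)((row0[2 * x] + row0[2 * x + 1] +
                        row1[2 * x] + row1[2 * x + 1] + 2) >> 2);
    }
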
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ MEMACCESS(1)
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN4_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+// Downscale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
+
+#ifdef HAS_SCALEROWDOWN34_NEON
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN34_NEON
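
The a0/a1/a2 comments in the two 3/4 box functions above are the three filter taps applied to each group of four source pixels. In rounded scalar arithmetic (a sketch):

    #include <stdint.h>

    static void Scale34_Taps(const uint8_t* s, uint8_t* d) {
      d[0] = (uint8_t)((3 * s[0] + s[1] + 2) >> 2);  /* weights 3:1 */
      d[1] = (uint8_t)((s[1] + s[2] + 1) >> 1);      /* weights 1:1 */
      d[2] = (uint8_t)((s[2] + 3 * s[3] + 2) >> 2);  /* weights 1:3 */
    }
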
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q3}, [%3] \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
+
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+ );
+}
+#endif //HAS_SCALEROWDOWN38_NEON
+
+#ifdef HAS_SCALEROWDOWN38_NEON
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+#endif // HAS_SCALEROWDOWN38_NEON
+
+#if 0
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+#endif // 0
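
Although compiled out, the general path above is worth decoding: source_y_fraction (0..255) weights the second row, and the 0/64/128/192 branches are merely cheaper paths for the fractions 0, 1/4, 1/2 and 3/4. A scalar sketch of the general blend, with a hypothetical name:

static void blend_rows(unsigned char *dst, const unsigned char *row0,
                       const unsigned char *row1, int width, int f) {
  const int f0 = 256 - f;  /* weight of row0; f is source_y_fraction */
  int i;
  for (i = 0; i < width; ++i) {
    /* vmull/vmlal accumulate row0*f0 + row1*f; vrshrn #8 rounds and scales */
    dst[i] = (unsigned char)((row0[i] * f0 + row1[i] * f + 128) >> 8);
  }
}
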
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_SCALEARGBROWDOWN2_NEON
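
vld2.32 deinterleaves alternating 32-bit ARGB pixels into even/odd registers, so the loop reduces to keeping every second pixel. A scalar equivalent, as a sketch (assumes 4-byte-aligned access; the name is hypothetical):

#include <stdint.h>

static void argb_down2(const uint32_t *src, uint32_t *dst, int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i)
    dst[i] = src[2 * i + 1];  /* keep the odd pixel of each pair, as above */
}
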
+
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+    // convert the stride argument into a pointer to the second row
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWN2_NEON
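
Here vpaddl sums horizontal pairs, vpadal accumulates the matching pairs from the second row, and vrshrn.u16 #2 is a rounding divide by 4 — a 2x2 box filter per channel. One output channel in scalar form (illustrative only):

static unsigned char box2x2_channel(const unsigned char *row0,
                                    const unsigned char *row1) {
  /* The same channel of the horizontally adjacent ARGB pixel is 4 bytes
   * away; +2 then >>2 matches the rounding divide done by vrshrn.u16 #2. */
  return (unsigned char)((row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2);
}
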
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
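
r12 holds the byte step (src_stepx pixels of 4 bytes each, hence the lsl #2), making the routine a strided gather. Scalar sketch (hypothetical name):

#include <stdint.h>

static void argb_down_even(const uint32_t *src, int src_stepx,
                           uint32_t *dst, int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i)
    dst[i] = src[i * src_stepx];  /* src_stepx counts whole ARGB pixels */
}
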
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
+#endif // __aarch64__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/tools_common.c b/tools_common.c
index 7cfd066..2ec1711 100644
--- a/tools_common.c
+++ b/tools_common.c
@@ -83,7 +83,7 @@
struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
int plane = 0;
int shortread = 0;
- const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGH) ? 2 : 1;
+ const int bytespp = (yuv_frame->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
for (plane = 0; plane < 3; ++plane) {
uint8_t *ptr;
@@ -241,7 +241,8 @@
for (plane = 0; plane < 3; ++plane) {
unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane);
+ const int w = vpx_img_plane_width(img, plane) *
+ ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
const int h = vpx_img_plane_height(img, plane);
int y;
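
The widened w accounts for high-bit-depth planes carrying two bytes per sample, so the raw write covers the full byte width of each row. In scalar terms (a sketch; the helper name is made up):

/* Bytes per row = samples across the plane times bytes per sample. */
static int row_bytes(int plane_width, int high_bitdepth) {
  return plane_width * (high_bitdepth ? 2 : 1);
}
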
diff --git a/tools_common.h b/tools_common.h
index 558413e..c1f466b 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -103,17 +103,25 @@
extern "C" {
#endif
+#if defined(__GNUC__)
+#define VPX_NO_RETURN __attribute__((noreturn))
+#else
+#define VPX_NO_RETURN
+#endif
+
/* Sets a stdio stream into binary mode */
FILE *set_binary_mode(FILE *stream);
-void die(const char *fmt, ...);
-void fatal(const char *fmt, ...);
+void die(const char *fmt, ...) VPX_NO_RETURN;
+void fatal(const char *fmt, ...) VPX_NO_RETURN;
void warn(const char *fmt, ...);
-void die_codec(vpx_codec_ctx_t *ctx, const char *s);
+void die_codec(vpx_codec_ctx_t *ctx, const char *s) VPX_NO_RETURN;
/* The tool including this file must define usage_exit() */
-void usage_exit();
+void usage_exit() VPX_NO_RETURN;
+
+#undef VPX_NO_RETURN
int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
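
The noreturn annotation lets the compiler treat code after a die()/fatal()/usage_exit() call as unreachable, enabling dead-code elimination and silencing spurious warnings on the paths it guards. A self-contained sketch of the pattern (all names hypothetical, not from the patch):

#include <stdio.h>
#include <stdlib.h>

#if defined(__GNUC__)
#define NO_RETURN __attribute__((noreturn))
#else
#define NO_RETURN
#endif

static NO_RETURN void fail(const char *msg) {
  fprintf(stderr, "%s\n", msg);
  exit(EXIT_FAILURE);
}

static int first_byte(const char *s) {
  if (s == NULL)
    fail("null input");  /* with noreturn, the compiler knows s != NULL here */
  return s[0];
}
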
diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index f37ca63..5840c2b 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c
@@ -25,22 +25,18 @@
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
-#if HAVE_NEON_ASM || HAVE_NEON
+#if HAVE_NEON
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
-#endif
-#if HAVE_NEON_ASM
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
-#endif
-#if HAVE_NEON
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
@@ -150,9 +146,7 @@
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
-#endif
-#if HAVE_NEON_ASM
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index d77f2ba..9824a31 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -10,7 +10,7 @@
#include <arm_neon.h>
-static const uint16_t bifilter4_coeff[8][2] = {
+static const uint8_t bifilter4_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
@@ -64,8 +64,8 @@
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@@ -155,8 +155,8 @@
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@@ -245,8 +245,8 @@
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
- d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
+ d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
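
Each bifilter4_coeff pair sums to 128, so every row is a 2-tap bilinear kernel and all entries fit in a uint8_t — which is why the vdup casts become redundant. The arithmetic in scalar form, as a sketch (the rows beyond the three shown in the hunk are assumed to continue the visible 16-step pattern):

/* 2-tap bilinear: out = (a*c0 + b*c1 + 64) >> 7, with c0 + c1 == 128. */
static unsigned char bilinear2(unsigned char a, unsigned char b,
                               int xoffset) {
  static const unsigned char coeff[8][2] = {
    {128, 0}, {112, 16}, {96, 32}, {80, 48},
    {64, 64}, {48, 80}, {32, 96}, {16, 112}
  };
  const int c0 = coeff[xoffset][0];
  const int c1 = coeff[xoffset][1];
  return (unsigned char)((a * c0 + b * c1 + 64) >> 7);
}
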
diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
deleted file mode 100644
index a8730aa..0000000
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ /dev/null
@@ -1,595 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_build_intra_predictors_mby_neon_func|
- EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_neon_func| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
-
- cmp r3, #0
- beq case_dc_pred
- cmp r3, #1
- beq case_v_pred
- cmp r3, #2
- beq case_h_pred
- cmp r3, #3
- beq case_tm_pred
-
-case_dc_pred
- ldr r4, [sp, #88] ; Up
- ldr r5, [sp, #92] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up
-
- cmp r5, #0
- beq skip_dc_pred_left
-
- sub r0, r0, #1
-
- ; Load left row, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-case_v_pred
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q0}, [r1]!
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_h_pred
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_tm_pred
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1]!
- vst1.u8 {q1}, [r1]!
- vst1.u8 {q2}, [r1]!
- vst1.u8 {q3}, [r1]!
-
- subs r12, r12, #1
- bne case_tm_pred_loop
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
- ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; r0 unsigned char *y_buffer
-; r1 unsigned char *ypred_ptr
-; r2 int y_stride
-; r3 int mode
-; stack int Up
-; stack int Left
-
-|vp8_build_intra_predictors_mby_s_neon_func| PROC
- push {r4-r8, lr}
- vpush {d8-d15}
-
- mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
-
- cmp r3, #0
- beq case_dc_pred_s
- cmp r3, #1
- beq case_v_pred_s
- cmp r3, #2
- beq case_h_pred_s
- cmp r3, #3
- beq case_tm_pred_s
-
-case_dc_pred_s
- ldr r4, [sp, #88] ; Up
- ldr r5, [sp, #92] ; Left
-
- ; Default the DC average to 128
- mov r12, #128
- vdup.u8 q0, r12
-
- ; Zero out running sum
- mov r12, #0
-
- ; compute shift and jump
- adds r7, r4, r5
- beq skip_dc_pred_up_left_s
-
- ; Load above row, if it exists
- cmp r4, #0
- beq skip_dc_pred_up_s
-
- sub r6, r0, r2
- vld1.8 {q1}, [r6]
- vpaddl.u8 q2, q1
- vpaddl.u16 q3, q2
- vpaddl.u32 q4, q3
-
- vmov.32 r4, d8[0]
- vmov.32 r6, d9[0]
-
- add r12, r4, r6
-
-    ; Move back to integer registers
-
-skip_dc_pred_up_s
-
- cmp r5, #0
- beq skip_dc_pred_left_s
-
- sub r0, r0, #1
-
- ; Load left row, if it exists
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0]
-
- add r12, r12, r3
- add r12, r12, r4
- add r12, r12, r5
- add r12, r12, r6
-
-skip_dc_pred_left_s
- add r7, r7, #3 ; Shift
- sub r4, r7, #1
- mov r5, #1
- add r12, r12, r5, lsl r4
- mov r5, r12, lsr r7 ; expected_dc
-
- vdup.u8 q0, r5
-
-skip_dc_pred_up_left_s
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-case_v_pred_s
- ; Copy down above row
- sub r6, r0, r2
- vld1.8 {q0}, [r6]
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q0}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_h_pred_s
- ; Load 4x yleft_col
- sub r0, r0, #1
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u8 q0, r3
- vdup.u8 q1, r4
- vdup.u8 q2, r5
- vdup.u8 q3, r6
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
-case_tm_pred_s
- ; Load yabove_row
- sub r3, r0, r2
- vld1.8 {q8}, [r3]
-
- ; Load ytop_left
- sub r3, r3, #1
- ldrb r7, [r3]
-
- vdup.u16 q7, r7
-
- ; Compute yabove_row - ytop_left
- mov r3, #1
- vdup.u8 q0, r3
-
- vmull.u8 q4, d16, d0
- vmull.u8 q5, d17, d0
-
- vsub.s16 q4, q4, q7
- vsub.s16 q5, q5, q7
-
- ; Load 4x yleft_col
- sub r0, r0, #1
- mov r12, #4
-
-case_tm_pred_loop_s
- ldrb r3, [r0], r2
- ldrb r4, [r0], r2
- ldrb r5, [r0], r2
- ldrb r6, [r0], r2
- vdup.u16 q0, r3
- vdup.u16 q1, r4
- vdup.u16 q2, r5
- vdup.u16 q3, r6
-
- vqadd.s16 q8, q0, q4
- vqadd.s16 q9, q0, q5
-
- vqadd.s16 q10, q1, q4
- vqadd.s16 q11, q1, q5
-
- vqadd.s16 q12, q2, q4
- vqadd.s16 q13, q2, q5
-
- vqadd.s16 q14, q3, q4
- vqadd.s16 q15, q3, q5
-
- vqshrun.s16 d0, q8, #0
- vqshrun.s16 d1, q9, #0
-
- vqshrun.s16 d2, q10, #0
- vqshrun.s16 d3, q11, #0
-
- vqshrun.s16 d4, q12, #0
- vqshrun.s16 d5, q13, #0
-
- vqshrun.s16 d6, q14, #0
- vqshrun.s16 d7, q15, #0
-
- vst1.u8 {q0}, [r1], r2
- vst1.u8 {q1}, [r1], r2
- vst1.u8 {q2}, [r1], r2
- vst1.u8 {q3}, [r1], r2
-
- subs r12, r12, #1
- bne case_tm_pred_loop_s
-
- vpop {d8-d15}
- pop {r4-r8,pc}
-
- ENDP
-
-
- END
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 3a39210..0000000
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,81 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_0_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq,
-; unsigned char *dst, int stride);
-; r0 *q
-; r1 dq
-; r2 *dst
-; r3 stride
-|idct_dequant_0_2x_neon| PROC
- push {r4, r5}
- vpush {d8-d15}
-
- add r12, r2, #4
- vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d8[0]}, [r12], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d8[1]}, [r12], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d10[1]}, [r12], r3
-
- ldrh r12, [r0] ; lo q
- ldrh r4, [r0, #32] ; hi q
- mov r5, #0
- strh r5, [r0]
- strh r5, [r0, #32]
-
- sxth r12, r12 ; lo
- mul r0, r12, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q0, r0
- sxth r4, r4 ; hi
- mul r0, r4, r1
- add r0, r0, #4
- asr r0, r0, #3
- vdup.16 q3, r0
-
- vaddw.u8 q1, q0, d2 ; lo
- vaddw.u8 q2, q0, d4
- vaddw.u8 q4, q3, d8 ; hi
- vaddw.u8 q5, q3, d10
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r0, r2, #4
-
- vqmovun.s16 d2, q1 ; lo
- vqmovun.s16 d4, q2
- vqmovun.s16 d8, q4 ; hi
- vqmovun.s16 d10, q5
-
- vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d8[0]}, [r0], r3 ; hi
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d8[1]}, [r0], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d10[0]}, [r0], r3
- vst1.32 {d4[1]}, [r2]
- vst1.32 {d10[1]}, [r0]
-
- vpop {d8-d15}
- pop {r4, r5}
- bx lr
-
- ENDP ; |idct_dequant_0_2x_neon|
- END
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
new file mode 100644
index 0000000..967c322
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void idct_dequant_0_2x_neon(
+ int16_t *q,
+ int16_t dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0;
+ int i, a0, a1;
+ int16x8x2_t q2Add;
+ int32x2_t d2s32, d4s32;
+ uint8x8_t d2u8, d4u8;
+    uint16x8_t q1u16, q2u16;
+
+    // initialize before the lane loads below; vld1_lane fills only one lane
+    d2s32 = d4s32 = vdup_n_s32(0);
+    a0 = ((q[0] * dq) + 4) >> 3;
+ a1 = ((q[16] * dq) + 4) >> 3;
+ q[0] = q[16] = 0;
+ q2Add.val[0] = vdupq_n_s16((int16_t)a0);
+ q2Add.val[1] = vdupq_n_s16((int16_t)a1);
+
+ for (i = 0; i < 2; i++, dst += 4) {
+ dst0 = dst;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
+
+ q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d2s32));
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
+ vreinterpret_u8_s32(d4s32));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+ d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+
+ d2s32 = vreinterpret_s32_u8(d2u8);
+ d4s32 = vreinterpret_s32_u8(d4u8);
+
+ dst0 = dst;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d2s32, 1);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst0, d4s32, 1);
+ }
+ return;
+}
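
With only the DC coefficient present, the 4x4 inverse transform collapses to adding one dequantized, rounded constant to every predicted pixel — exactly what the a0/a1 paths compute for the two adjacent blocks. A scalar sketch for a single block (the clamp helper is hypothetical):

static unsigned char clamp255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void idct_dc_add_4x4(short q0, short dq,
                            unsigned char *dst, int stride) {
  const int a = (q0 * dq + 4) >> 3;  /* dequantize, round, >> 3, as above */
  int r, c;
  for (r = 0; r < 4; ++r, dst += stride)
    for (c = 0; c < 4; ++c)
      dst[c] = clamp255(dst[c] + a);  /* vaddw.u8 + vqmovun.s16 per row */
}
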
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
deleted file mode 100644
index 8da0fa0..0000000
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |idct_dequant_full_2x_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq,
-; unsigned char *dst, int stride);
-; r0 *q,
-; r1 *dq,
-; r2 *dst
-; r3 stride
-|idct_dequant_full_2x_neon| PROC
- vpush {d8-d15}
-
- vld1.16 {q0, q1}, [r1] ; dq (same l/r)
- vld1.16 {q2, q3}, [r0] ; l q
- add r0, r0, #32
- vld1.16 {q4, q5}, [r0] ; r q
- add r12, r2, #4
-
- ; interleave the predictors
- vld1.32 {d28[0]}, [r2], r3 ; l pre
- vld1.32 {d28[1]}, [r12], r3 ; r pre
- vld1.32 {d29[0]}, [r2], r3
- vld1.32 {d29[1]}, [r12], r3
- vld1.32 {d30[0]}, [r2], r3
- vld1.32 {d30[1]}, [r12], r3
- vld1.32 {d31[0]}, [r2], r3
- vld1.32 {d31[1]}, [r12]
-
- adr r1, cospi8sqrt2minus1 ; pointer to the first constant
-
- ; dequant: q[i] = q[i] * dq[i]
- vmul.i16 q2, q2, q0
- vmul.i16 q3, q3, q1
- vmul.i16 q4, q4, q0
- vmul.i16 q5, q5, q1
-
- vld1.16 {d0}, [r1]
-
- ; q2: l0r0 q3: l8r8
- ; q4: l4r4 q5: l12r12
- vswp d5, d8
- vswp d7, d10
-
- ; _CONSTANTS_ * 4,12 >> 16
- ; q6: 4 * sinpi : c1/temp1
- ; q7: 12 * sinpi : d1/temp2
- ; q8: 4 * cospi
- ; q9: 12 * cospi
- vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q7, q5, d0[2]
- vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q9, q5, d0[0]
-
- vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
- vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
-
- ; vqdmulh only accepts signed values. this was a problem because
- ; our constant had the high bit set, and was treated as a negative value.
- ; vqdmulh also doubles the value before it shifts by 16. we need to
- ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
- ; so we can shift the constant without losing precision. this avoids
- ; shift again afterward, but also avoids the sign issue. win win!
- ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
- ; pre-shift it
- vshr.s16 q8, q8, #1
- vshr.s16 q9, q9, #1
-
- ; q4: 4 + 4 * cospi : d1/temp1
- ; q5: 12 + 12 * cospi : c1/temp2
- vqadd.s16 q4, q4, q8
- vqadd.s16 q5, q5, q9
-
- ; c1 = temp1 - temp2
- ; d1 = temp1 + temp2
- vqsub.s16 q2, q6, q5
- vqadd.s16 q3, q4, q7
-
- ; [0]: a1+d1
- ; [1]: b1+c1
- ; [2]: b1-c1
- ; [3]: a1-d1
- vqadd.s16 q4, q10, q3
- vqadd.s16 q5, q11, q2
- vqsub.s16 q6, q11, q2
- vqsub.s16 q7, q10, q3
-
- ; rotate
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
- ; idct loop 2
- ; q4: l 0, 4, 8,12 r 0, 4, 8,12
- ; q5: l 1, 5, 9,13 r 1, 5, 9,13
- ; q6: l 2, 6,10,14 r 2, 6,10,14
- ; q7: l 3, 7,11,15 r 3, 7,11,15
-
- ; q8: 1 * sinpi : c1/temp1
- ; q9: 3 * sinpi : d1/temp2
- ; q10: 1 * cospi
- ; q11: 3 * cospi
- vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
- vqdmulh.s16 q9, q7, d0[2]
- vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
- vqdmulh.s16 q11, q7, d0[0]
-
- vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
- vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
-
- ; see note on shifting above
- vshr.s16 q10, q10, #1
- vshr.s16 q11, q11, #1
-
- ; q10: 1 + 1 * cospi : d1/temp1
- ; q11: 3 + 3 * cospi : c1/temp2
- vqadd.s16 q10, q5, q10
- vqadd.s16 q11, q7, q11
-
- ; q8: c1 = temp1 - temp2
- ; q9: d1 = temp1 + temp2
- vqsub.s16 q8, q8, q11
- vqadd.s16 q9, q10, q9
-
- ; a1+d1
- ; b1+c1
- ; b1-c1
- ; a1-d1
- vqadd.s16 q4, q2, q9
- vqadd.s16 q5, q3, q8
- vqsub.s16 q6, q3, q8
- vqsub.s16 q7, q2, q9
-
- ; +4 >> 3 (rounding)
- vrshr.s16 q4, q4, #3 ; lo
- vrshr.s16 q5, q5, #3
- vrshr.s16 q6, q6, #3 ; hi
- vrshr.s16 q7, q7, #3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- ; adding pre
- ; input is still packed. pre was read interleaved
- vaddw.u8 q4, q4, d28
- vaddw.u8 q5, q5, d29
- vaddw.u8 q6, q6, d30
- vaddw.u8 q7, q7, d31
-
- vmov.i16 q14, #0
- vmov q15, q14
- vst1.16 {q14, q15}, [r0] ; write over high input
- sub r0, r0, #32
- vst1.16 {q14, q15}, [r0] ; write over low input
-
- sub r2, r2, r3, lsl #2 ; dst - 4*stride
- add r1, r2, #4 ; hi
-
- ;saturate and narrow
- vqmovun.s16 d0, q4 ; lo
- vqmovun.s16 d1, q5
- vqmovun.s16 d2, q6 ; hi
- vqmovun.s16 d3, q7
-
- vst1.32 {d0[0]}, [r2], r3 ; lo
- vst1.32 {d0[1]}, [r1], r3 ; hi
- vst1.32 {d1[0]}, [r2], r3
- vst1.32 {d1[1]}, [r1], r3
- vst1.32 {d2[0]}, [r2], r3
- vst1.32 {d2[1]}, [r1], r3
- vst1.32 {d3[0]}, [r2]
- vst1.32 {d3[1]}, [r1]
-
- vpop {d8-d15}
- bx lr
-
- ENDP ; |idct_dequant_full_2x_neon|
-
-; Constant Pool
-cospi8sqrt2minus1 DCD 0x4e7b
-; because the lowest bit in 0x8a8c is 0, we can pre-shift this
-sinpi8sqrt2 DCD 0x4546
-
- END
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
new file mode 100644
index 0000000..a60ed46
--- /dev/null
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+static const int16_t cospi8sqrt2minus1 = 20091;
+static const int16_t sinpi8sqrt2 = 17734;
+// sinpi8sqrt2 is 0x8a8c (35468) pre-shifted to 17734; the lowest bit of
+// 0x8a8c is 0, so no precision is lost by the pre-shift
+
+void idct_dequant_full_2x_neon(
+ int16_t *q,
+ int16_t *dq,
+ unsigned char *dst,
+ int stride) {
+ unsigned char *dst0, *dst1;
+ int32x2_t d28, d29, d30, d31;
+ int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
+ int16x8_t qEmpty = vdupq_n_s16(0);
+ int32x4x2_t q2tmp0, q2tmp1;
+ int16x8x2_t q2tmp2, q2tmp3;
+ int16x4_t dLow0, dLow1, dHigh0, dHigh1;
+
+ d28 = d29 = d30 = d31 = vdup_n_s32(0);
+
+ // load dq
+ q0 = vld1q_s16(dq);
+ dq += 8;
+ q1 = vld1q_s16(dq);
+
+ // load q
+ q2 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q3 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q4 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+ q += 8;
+ q5 = vld1q_s16(q);
+ vst1q_s16(q, qEmpty);
+
+ // load src from dst
+ dst0 = dst;
+ dst1 = dst + 4;
+ d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
+ d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
+
+ q2 = vmulq_s16(q2, q0);
+ q3 = vmulq_s16(q3, q1);
+ q4 = vmulq_s16(q4, q0);
+ q5 = vmulq_s16(q5, q1);
+
+ // vswp
+ dLow0 = vget_low_s16(q2);
+ dHigh0 = vget_high_s16(q2);
+ dLow1 = vget_low_s16(q4);
+ dHigh1 = vget_high_s16(q4);
+ q2 = vcombine_s16(dLow0, dLow1);
+ q4 = vcombine_s16(dHigh0, dHigh1);
+
+ dLow0 = vget_low_s16(q3);
+ dHigh0 = vget_high_s16(q3);
+ dLow1 = vget_low_s16(q5);
+ dHigh1 = vget_high_s16(q5);
+ q3 = vcombine_s16(dLow0, dLow1);
+ q5 = vcombine_s16(dHigh0, dHigh1);
+
+ q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
+ q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
+ q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
+ q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
+
+ q10 = vqaddq_s16(q2, q3);
+ q11 = vqsubq_s16(q2, q3);
+
+ q8 = vshrq_n_s16(q8, 1);
+ q9 = vshrq_n_s16(q9, 1);
+
+ q4 = vqaddq_s16(q4, q8);
+ q5 = vqaddq_s16(q5, q9);
+
+ q2 = vqsubq_s16(q6, q5);
+ q3 = vqaddq_s16(q7, q4);
+
+ q4 = vqaddq_s16(q10, q3);
+ q5 = vqaddq_s16(q11, q2);
+ q6 = vqsubq_s16(q11, q2);
+ q7 = vqsubq_s16(q10, q3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ // loop 2
+ q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
+ q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
+ q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
+ q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
+
+ q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+ q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
+
+ q10 = vshrq_n_s16(q10, 1);
+ q11 = vshrq_n_s16(q11, 1);
+
+ q10 = vqaddq_s16(q2tmp2.val[1], q10);
+ q11 = vqaddq_s16(q2tmp3.val[1], q11);
+
+ q8 = vqsubq_s16(q8, q11);
+ q9 = vqaddq_s16(q9, q10);
+
+ q4 = vqaddq_s16(q2, q9);
+ q5 = vqaddq_s16(q3, q8);
+ q6 = vqsubq_s16(q3, q8);
+ q7 = vqsubq_s16(q2, q9);
+
+ q4 = vrshrq_n_s16(q4, 3);
+ q5 = vrshrq_n_s16(q5, 3);
+ q6 = vrshrq_n_s16(q6, 3);
+ q7 = vrshrq_n_s16(q7, 3);
+
+ q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
+ q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
+ q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
+ vreinterpretq_s16_s32(q2tmp1.val[0]));
+ q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
+ vreinterpretq_s16_s32(q2tmp1.val[1]));
+
+ q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
+ vreinterpret_u8_s32(d28)));
+ q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
+ vreinterpret_u8_s32(d29)));
+ q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
+ vreinterpret_u8_s32(d30)));
+ q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
+ vreinterpret_u8_s32(d31)));
+
+ d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
+ d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
+ d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
+ d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
+
+ dst0 = dst;
+ dst1 = dst + 4;
+ vst1_lane_s32((int32_t *)dst0, d28, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d28, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d29, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d29, 1);
+ dst1 += stride;
+
+ vst1_lane_s32((int32_t *)dst0, d30, 0);
+ dst0 += stride;
+ vst1_lane_s32((int32_t *)dst1, d30, 1);
+ dst1 += stride;
+ vst1_lane_s32((int32_t *)dst0, d31, 0);
+ vst1_lane_s32((int32_t *)dst1, d31, 1);
+ return;
+}
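
Both constants are Q16 approximations of irrational multipliers: cos(pi/8)*sqrt(2) - 1 ~= 0.30656 gives 20091, and sin(pi/8)*sqrt(2) ~= 0.54120 gives 35468 (0x8a8c). Since vqdmulh doubles the product before keeping the high 16 bits, 35468, whose low bit is clear, is stored pre-halved as 17734; 20091 has its low bit set, so the code instead compensates with vshrq_n_s16(..., 1) after the multiply. The core operation in scalar form (saturation omitted; the name is made up):

/* What vqdmulh computes, minus saturation: (2*a*k) >> 16. */
static short mul_q16_doubling(short a, short k) {
  return (short)(((int)a * k) >> 15);
}
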
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
deleted file mode 100644
index c4f09c7..0000000
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ /dev/null
@@ -1,409 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
- EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
- EXPORT |vp8_loop_filter_vertical_edge_y_neon|
- EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
- ARM
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-|vp8_loop_filter_horizontal_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1
- add r1, r1, r1
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vld1.u8 {q3}, [r2@128], r1 ; p3
- vld1.u8 {q4}, [r12@128], r1 ; p2
- vld1.u8 {q5}, [r2@128], r1 ; p1
- vld1.u8 {q6}, [r12@128], r1 ; p0
- vld1.u8 {q7}, [r2@128], r1 ; q0
- vld1.u8 {q8}, [r12@128], r1 ; q1
- vld1.u8 {q9}, [r2@128] ; q2
- vld1.u8 {q10}, [r12@128] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r12, r12, r1, lsl #1
-
- bl vp8_loop_filter_neon
-
- vst1.u8 {q5}, [r2@128], r1 ; store op1
- vst1.u8 {q6}, [r12@128], r1 ; store op0
- vst1.u8 {q7}, [r2@128], r1 ; store oq0
- vst1.u8 {q8}, [r12@128], r1 ; store oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_horizontal_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- ldr r12, [sp, #68] ; load thresh
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q2, r12 ; duplicate thresh
-
- sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
- sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
-
- vld1.u8 {d6}, [r3@64], r1 ; p3
- vld1.u8 {d7}, [r12@64], r1 ; p3
- vld1.u8 {d8}, [r3@64], r1 ; p2
- vld1.u8 {d9}, [r12@64], r1 ; p2
- vld1.u8 {d10}, [r3@64], r1 ; p1
- vld1.u8 {d11}, [r12@64], r1 ; p1
- vld1.u8 {d12}, [r3@64], r1 ; p0
- vld1.u8 {d13}, [r12@64], r1 ; p0
- vld1.u8 {d14}, [r3@64], r1 ; q0
- vld1.u8 {d15}, [r12@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r12@64], r1 ; q1
- vld1.u8 {d18}, [r3@64], r1 ; q2
- vld1.u8 {d19}, [r12@64], r1 ; q2
- vld1.u8 {d20}, [r3@64] ; q3
- vld1.u8 {d21}, [r12@64] ; q3
-
- bl vp8_loop_filter_neon
-
- sub r0, r0, r1, lsl #1
- sub r2, r2, r1, lsl #1
-
- vst1.u8 {d10}, [r0@64], r1 ; store u op1
- vst1.u8 {d11}, [r2@64], r1 ; store v op1
- vst1.u8 {d12}, [r0@64], r1 ; store u op0
- vst1.u8 {d13}, [r2@64], r1 ; store v op0
- vst1.u8 {d14}, [r0@64], r1 ; store u oq0
- vst1.u8 {d15}, [r2@64], r1 ; store v oq0
- vst1.u8 {d16}, [r0@64] ; store u oq1
- vst1.u8 {d17}, [r2@64] ; store v oq1
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
-
-; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; int count)
-; r0 unsigned char *src
-; r1 int pitch
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-
-|vp8_loop_filter_vertical_edge_y_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- vdup.u8 q1, r3 ; duplicate limit
- sub r2, r0, #4 ; src ptr down by 4 columns
- add r1, r1, r1
- ldr r3, [sp, #68] ; load thresh
- add r12, r2, r1, asr #1
-
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d10}, [r2], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d14}, [r2], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d18}, [r2], r1
- vld1.u8 {d20}, [r12], r1
-
- vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
- vld1.u8 {d9}, [r12], r1
- vld1.u8 {d11}, [r2], r1
- vld1.u8 {d13}, [r12], r1
- vld1.u8 {d15}, [r2], r1
- vld1.u8 {d17}, [r12], r1
- vld1.u8 {d19}, [r2]
- vld1.u8 {d21}, [r12]
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r3 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
-
- sub r0, r0, #2 ; dst ptr
-
- vswp d14, d12
- vswp d16, d15
-
- add r12, r0, r1, asr #1
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
-
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
-; const signed char *flimit,
-; const signed char *limit,
-; const signed char *thresh,
-; unsigned char *v)
-; r0 unsigned char *u,
-; r1 int pitch,
-; r2 unsigned char blimit
-; r3 unsigned char limit
-; sp unsigned char thresh,
-; sp+4 unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vdup.u8 q0, r2 ; duplicate blimit
- sub r12, r0, #4 ; move u pointer down by 4 columns
- ldr r2, [sp, #72] ; load v ptr
- vdup.u8 q1, r3 ; duplicate limit
- sub r3, r2, #4 ; move v pointer down by 4 columns
-
- vld1.u8 {d6}, [r12], r1 ;load u data
- vld1.u8 {d7}, [r3], r1 ;load v data
- vld1.u8 {d8}, [r12], r1
- vld1.u8 {d9}, [r3], r1
- vld1.u8 {d10}, [r12], r1
- vld1.u8 {d11}, [r3], r1
- vld1.u8 {d12}, [r12], r1
- vld1.u8 {d13}, [r3], r1
- vld1.u8 {d14}, [r12], r1
- vld1.u8 {d15}, [r3], r1
- vld1.u8 {d16}, [r12], r1
- vld1.u8 {d17}, [r3], r1
- vld1.u8 {d18}, [r12], r1
- vld1.u8 {d19}, [r3], r1
- vld1.u8 {d20}, [r12]
- vld1.u8 {d21}, [r3]
-
- ldr r12, [sp, #68] ; load thresh
-
- ;transpose to 8x16 matrix
- vtrn.32 q3, q7
- vtrn.32 q4, q8
- vtrn.32 q5, q9
- vtrn.32 q6, q10
-
- vdup.u8 q2, r12 ; duplicate thresh
-
- vtrn.16 q3, q5
- vtrn.16 q4, q6
- vtrn.16 q7, q9
- vtrn.16 q8, q10
-
- vtrn.8 q3, q4
- vtrn.8 q5, q6
- vtrn.8 q7, q8
- vtrn.8 q9, q10
-
- bl vp8_loop_filter_neon
-
- vswp d12, d11
- vswp d16, d13
- vswp d14, d12
- vswp d16, d15
-
- sub r0, r0, #2
- sub r2, r2, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
- vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
- vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
- vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
- vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
- vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
- vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
- vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
- vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
- vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
- vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
- vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
- vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
- vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
- vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
- vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
-
- vpop {d8-d15}
- pop {pc}
- ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
-
-; void vp8_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store.
-
-; r0-r3 PRESERVE
-; q0 flimit
-; q1 limit
-; q2 thresh
-; q3 p3
-; q4 p2
-; q5 p1
-; q6 p0
-; q7 q0
-; q8 q1
-; q9 q2
-; q10 q3
-|vp8_loop_filter_neon| PROC
-
- ; vp8_filter_mask
- vabd.u8 q11, q3, q4 ; abs(p3 - p2)
- vabd.u8 q12, q4, q5 ; abs(p2 - p1)
- vabd.u8 q13, q5, q6 ; abs(p1 - p0)
- vabd.u8 q14, q8, q7 ; abs(q1 - q0)
- vabd.u8 q3, q9, q8 ; abs(q2 - q1)
- vabd.u8 q4, q10, q9 ; abs(q3 - q2)
-
- vmax.u8 q11, q11, q12
- vmax.u8 q12, q13, q14
- vmax.u8 q3, q3, q4
- vmax.u8 q15, q11, q12
-
- vabd.u8 q9, q6, q7 ; abs(p0 - q0)
-
- ; vp8_hevmask
- vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 q15, q15, q3
-
- vmov.u8 q10, #0x80 ; 0x80
-
- vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
- vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
-
- vcge.u8 q15, q1, q15
-
- ; vp8_filter() function
- ; convert to signed
- veor q7, q7, q10 ; qs0
- vshr.u8 q2, q2, #1 ; a = a / 2
- veor q6, q6, q10 ; ps0
-
- veor q5, q5, q10 ; ps1
- vqadd.u8 q9, q9, q2 ; a = b + a
-
- veor q8, q8, q10 ; qs1
-
- vmov.u8 q10, #3 ; #3
-
- vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
- vsubl.s8 q11, d15, d13
-
- vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
-
- vmovl.u8 q4, d20
-
- vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
- vorr q14, q13, q14 ; vp8_hevmask
-
- vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
- vmul.i16 q11, q11, q4
-
- vand q1, q1, q14 ; vp8_filter &= hev
- vand q15, q15, q9 ; vp8_filter_mask
-
- vaddw.s8 q2, q2, d2
- vaddw.s8 q11, q11, d3
-
- vmov.u8 q9, #4 ; #4
-
- ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d2, q2
- vqmovn.s16 d3, q11
- vand q1, q1, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
- vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q1, q1, #3 ; Filter1 >>= 3
-
-
- vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
- vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
-
- ; outer tap adjustments: ++vp8_filter >> 1
- vrshr.s8 q1, q1, #1
- vbic q1, q1, q14 ; vp8_filter &= ~hev
- vmov.u8 q0, #0x80 ; 0x80
- vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
- vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- veor q5, q13, q0 ; *op1 = u^0x80
- veor q8, q12, q0 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
-
-;-----------------
-
- END
diff --git a/vp8/common/arm/neon/loopfilter_neon.c b/vp8/common/arm/neon/loopfilter_neon.c
new file mode 100644
index 0000000..0bec7fb
--- /dev/null
+++ b/vp8/common/arm/neon/loopfilter_neon.c
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_neon(
+ uint8x16_t qblimit, // flimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q10 = vdupq_n_u8(3);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vmovl_u8(vget_low_u8(q10));
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ q0u8 = vdupq_n_u8(0x80);
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+ src -= (pitch << 2);
+
+ q3 = vld1q_u8(src);
+ src += pitch;
+ q4 = vld1q_u8(src);
+ src += pitch;
+ q5 = vld1q_u8(src);
+ src += pitch;
+ q6 = vld1q_u8(src);
+ src += pitch;
+ q7 = vld1q_u8(src);
+ src += pitch;
+ q8 = vld1q_u8(src);
+ src += pitch;
+ q9 = vld1q_u8(src);
+ src += pitch;
+ q10 = vld1q_u8(src);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ src -= (pitch * 5);
+ vst1q_u8(src, q5);
+ src += pitch;
+ vst1q_u8(src, q6);
+ src += pitch;
+ vst1q_u8(src, q7);
+ src += pitch;
+ vst1q_u8(src, q8);
+ return;
+}
+
+void vp8_loop_filter_horizontal_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ u -= (pitch << 2);
+ v -= (pitch << 2);
+
+ d6 = vld1_u8(u);
+ u += pitch;
+ d7 = vld1_u8(v);
+ v += pitch;
+ d8 = vld1_u8(u);
+ u += pitch;
+ d9 = vld1_u8(v);
+ v += pitch;
+ d10 = vld1_u8(u);
+ u += pitch;
+ d11 = vld1_u8(v);
+ v += pitch;
+ d12 = vld1_u8(u);
+ u += pitch;
+ d13 = vld1_u8(v);
+ v += pitch;
+ d14 = vld1_u8(u);
+ u += pitch;
+ d15 = vld1_u8(v);
+ v += pitch;
+ d16 = vld1_u8(u);
+ u += pitch;
+ d17 = vld1_u8(v);
+ v += pitch;
+ d18 = vld1_u8(u);
+ u += pitch;
+ d19 = vld1_u8(v);
+ v += pitch;
+ d20 = vld1_u8(u);
+ d21 = vld1_u8(v);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ u -= (pitch * 5);
+ vst1_u8(u, vget_low_u8(q5));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q6));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q7));
+ u += pitch;
+ vst1_u8(u, vget_low_u8(q8));
+
+ v -= (pitch * 5);
+ vst1_u8(v, vget_high_u8(q5));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q6));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q7));
+ v += pitch;
+ vst1_u8(v, vget_high_u8(q8));
+ return;
+}
+
+static INLINE void write_4x8(unsigned char *dst, int pitch,
+ const uint8x8x4_t result) {
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+ vst4_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst4_lane_u8(dst, result, 7);
+#else
+ /*
+ * uint8x8x4_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ 20 21 22 23 | 24 25 26 27
+ 30 31 32 33 | 34 35 36 37
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 04 05 24 25
+ 02 03 22 23 | 06 07 26 27
+ 10 11 30 31 | 14 15 34 35
+ 12 13 32 33 | 16 17 36 37
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 04 14 24 34
+ 01 11 21 31 | 05 15 25 35
+ 02 12 22 32 | 06 16 26 36
+ 03 13 23 33 | 07 17 27 37
+ */
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
+ vreinterpret_u16_u8(result.val[2]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
+ vreinterpret_u16_u8(result.val[3]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
+ const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
+ const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
+ const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
+ dst += pitch;
+ vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
+#endif
+}
+
+void vp8_loop_filter_vertical_edge_y_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh) {
+ unsigned char *s, *d;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ s = src - 4;
+ d6 = vld1_u8(s);
+ s += pitch;
+ d8 = vld1_u8(s);
+ s += pitch;
+ d10 = vld1_u8(s);
+ s += pitch;
+ d12 = vld1_u8(s);
+ s += pitch;
+ d14 = vld1_u8(s);
+ s += pitch;
+ d16 = vld1_u8(s);
+ s += pitch;
+ d18 = vld1_u8(s);
+ s += pitch;
+ d20 = vld1_u8(s);
+ s += pitch;
+ d7 = vld1_u8(s);
+ s += pitch;
+ d9 = vld1_u8(s);
+ s += pitch;
+ d11 = vld1_u8(s);
+ s += pitch;
+ d13 = vld1_u8(s);
+ s += pitch;
+ d15 = vld1_u8(s);
+ s += pitch;
+ d17 = vld1_u8(s);
+ s += pitch;
+ d19 = vld1_u8(s);
+ s += pitch;
+ d21 = vld1_u8(s);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
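+  /* Transpose the 16x8 block with a vtrnq u32 -> u16 -> u8 ladder so that
+   * columns become rows and the horizontal filter core can be reused. */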
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+
+ d = src - 2;
+ write_4x8(d, pitch, q4ResultL);
+ d += pitch * 8;
+ write_4x8(d, pitch, q4ResultH);
+}
+
+void vp8_loop_filter_vertical_edge_uv_neon(
+ unsigned char *u,
+ int pitch,
+ unsigned char blimit,
+ unsigned char limit,
+ unsigned char thresh,
+ unsigned char *v) {
+ unsigned char *us, *ud;
+ unsigned char *vs, *vd;
+ uint8x16_t qblimit, qlimit, qthresh, q3, q4;
+ uint8x16_t q5, q6, q7, q8, q9, q10;
+ uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
+ uint8x8_t d15, d16, d17, d18, d19, d20, d21;
+ uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
+ uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
+ uint8x8x4_t q4ResultH, q4ResultL;
+
+ qblimit = vdupq_n_u8(blimit);
+ qlimit = vdupq_n_u8(limit);
+ qthresh = vdupq_n_u8(thresh);
+
+ us = u - 4;
+ d6 = vld1_u8(us);
+ us += pitch;
+ d8 = vld1_u8(us);
+ us += pitch;
+ d10 = vld1_u8(us);
+ us += pitch;
+ d12 = vld1_u8(us);
+ us += pitch;
+ d14 = vld1_u8(us);
+ us += pitch;
+ d16 = vld1_u8(us);
+ us += pitch;
+ d18 = vld1_u8(us);
+ us += pitch;
+ d20 = vld1_u8(us);
+
+ vs = v - 4;
+ d7 = vld1_u8(vs);
+ vs += pitch;
+ d9 = vld1_u8(vs);
+ vs += pitch;
+ d11 = vld1_u8(vs);
+ vs += pitch;
+ d13 = vld1_u8(vs);
+ vs += pitch;
+ d15 = vld1_u8(vs);
+ vs += pitch;
+ d17 = vld1_u8(vs);
+ vs += pitch;
+ d19 = vld1_u8(vs);
+ vs += pitch;
+ d21 = vld1_u8(vs);
+
+ q3 = vcombine_u8(d6, d7);
+ q4 = vcombine_u8(d8, d9);
+ q5 = vcombine_u8(d10, d11);
+ q6 = vcombine_u8(d12, d13);
+ q7 = vcombine_u8(d14, d15);
+ q8 = vcombine_u8(d16, d17);
+ q9 = vcombine_u8(d18, d19);
+ q10 = vcombine_u8(d20, d21);
+
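+  /* Same 16x8 transpose ladder as in the Y version above. */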
+ q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
+ q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
+ q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
+ q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
+
+ q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
+ vreinterpretq_u16_u32(q2tmp2.val[0]));
+ q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
+ vreinterpretq_u16_u32(q2tmp3.val[0]));
+ q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
+ vreinterpretq_u16_u32(q2tmp2.val[1]));
+ q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
+ vreinterpretq_u16_u32(q2tmp3.val[1]));
+
+ q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
+ vreinterpretq_u8_u16(q2tmp5.val[0]));
+ q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
+ vreinterpretq_u8_u16(q2tmp5.val[1]));
+ q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
+ vreinterpretq_u8_u16(q2tmp7.val[0]));
+ q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
+ vreinterpretq_u8_u16(q2tmp7.val[1]));
+
+ q3 = q2tmp8.val[0];
+ q4 = q2tmp8.val[1];
+ q5 = q2tmp9.val[0];
+ q6 = q2tmp9.val[1];
+ q7 = q2tmp10.val[0];
+ q8 = q2tmp10.val[1];
+ q9 = q2tmp11.val[0];
+ q10 = q2tmp11.val[1];
+
+ vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
+ q5, q6, q7, q8, q9, q10,
+ &q5, &q6, &q7, &q8);
+
+ q4ResultL.val[0] = vget_low_u8(q5); // d10
+ q4ResultL.val[1] = vget_low_u8(q6); // d12
+ q4ResultL.val[2] = vget_low_u8(q7); // d14
+ q4ResultL.val[3] = vget_low_u8(q8); // d16
+ ud = u - 2;
+ write_4x8(ud, pitch, q4ResultL);
+
+ q4ResultH.val[0] = vget_high_u8(q5); // d11
+ q4ResultH.val[1] = vget_high_u8(q6); // d13
+ q4ResultH.val[2] = vget_high_u8(q7); // d15
+ q4ResultH.val[3] = vget_high_u8(q8); // d17
+ vd = v - 2;
+ write_4x8(vd, pitch, q4ResultH);
+}
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
deleted file mode 100644
index 78d13c8..0000000
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,156 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_loop_filter_bvs_neon|
- EXPORT |vp8_loop_filter_mbvs_neon|
- ARM
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *s, PRESERVE
-; r1 int p, PRESERVE
-; q1 limit, PRESERVE
-
-|vp8_loop_filter_simple_vertical_edge_neon| PROC
- vpush {d8-d15}
-
- sub r0, r0, #2 ; move src pointer down by 2 columns
- add r12, r1, r1
- add r3, r0, r1
-
- vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
- vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
- vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
- vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
- vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
- vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
- vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
- vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
-
- vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
- vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
- vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
- vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
- vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
- vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
- vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
- vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
-
- vswp d7, d10
- vswp d12, d9
-
- ;vp8_filter_mask() function
- ;vp8_hevmask() function
- sub r0, r0, r1, lsl #4
- vabd.u8 q15, q5, q4 ; abs(p0 - q0)
- vabd.u8 q14, q3, q6 ; abs(p1 - q1)
-
- vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
- vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
- vmov.u8 q0, #0x80 ; 0x80
- vmov.s16 q11, #3
- vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
- veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
- veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
- veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
- veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
-
- vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
-
- vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
- vsubl.s8 q13, d9, d11
-
- vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
-
- vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
- vmul.s16 q13, q13, q11
-
- vmov.u8 q11, #0x03 ; 0x03
- vmov.u8 q12, #0x04 ; 0x04
-
- vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
- vaddw.s8 q13, q13, d29
-
- vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d29, q13
-
- add r0, r0, #1
- add r3, r0, r1
-
- vand q14, q14, q15 ; vp8_filter &= mask
-
- vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
- vshr.s8 q2, q2, #3 ; Filter2 >>= 3
- vshr.s8 q14, q3, #3 ; Filter1 >>= 3
-
- ;calculate output
- vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
- vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
-
- veor q6, q11, q0 ; *op0 = u^0x80
- veor q7, q10, q0 ; *oq0 = u^0x80
- add r12, r1, r1
- vswp d13, d14
-
- ;store op1, op0, oq0, oq1
- vst2.8 {d12[0], d13[0]}, [r0], r12
- vst2.8 {d12[1], d13[1]}, [r3], r12
- vst2.8 {d12[2], d13[2]}, [r0], r12
- vst2.8 {d12[3], d13[3]}, [r3], r12
- vst2.8 {d12[4], d13[4]}, [r0], r12
- vst2.8 {d12[5], d13[5]}, [r3], r12
- vst2.8 {d12[6], d13[6]}, [r0], r12
- vst2.8 {d12[7], d13[7]}, [r3], r12
- vst2.8 {d14[0], d15[0]}, [r0], r12
- vst2.8 {d14[1], d15[1]}, [r3], r12
- vst2.8 {d14[2], d15[2]}, [r0], r12
- vst2.8 {d14[3], d15[3]}, [r3], r12
- vst2.8 {d14[4], d15[4]}, [r0], r12
- vst2.8 {d14[5], d15[5]}, [r3], r12
- vst2.8 {d14[6], d15[6]}, [r0], r12
- vst2.8 {d14[7], d15[7]}, [r3]
-
- vpop {d8-d15}
- bx lr
- ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_bvs_neon| PROC
- push {r4, lr}
- ldrb r3, [r2] ; load blim from mem
- mov r4, r0
- add r0, r0, #4
- vdup.s8 q1, r3 ; duplicate blim
- bl vp8_loop_filter_simple_vertical_edge_neon
- ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
- add r0, r4, #8
- bl vp8_loop_filter_simple_vertical_edge_neon
- add r0, r4, #12
- pop {r4, lr}
- b vp8_loop_filter_simple_vertical_edge_neon
- ENDP ;|vp8_loop_filter_bvs_neon|
-
-; r0 unsigned char *y
-; r1 int ystride
-; r2 const unsigned char *blimit
-
-|vp8_loop_filter_mbvs_neon| PROC
- ldrb r3, [r2] ; load mblim from mem
- vdup.s8 q1, r3 ; duplicate mblim
- b vp8_loop_filter_simple_vertical_edge_neon
- ENDP ;|vp8_loop_filter_bvs_neon|
- END
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 0000000..d5178bb
--- /dev/null
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
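+/* As with write_4x8 earlier in this change, only GCC 4.7-4.9 satisfy these
+ * version gates; other compilers use the explicit-transpose fallbacks. */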
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ vst2_lane_u8(dst, result, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result, 7);
+ dst += pitch;
+
+ vst2_lane_u8(dst, result2, 0);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 1);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 2);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 3);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 4);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 5);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 6);
+ dst += pitch;
+ vst2_lane_u8(dst, result2, 7);
+}
+#else
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+ const uint8x8x2_t result) {
+ /*
+ * uint8x8x2_t result
+ 00 01 02 03 | 04 05 06 07
+ 10 11 12 13 | 14 15 16 17
+ ---
+ * after vtrn_u8
+ 00 10 02 12 | 04 14 06 16
+ 01 11 03 13 | 05 15 07 17
+ */
+ const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+ result.val[1]);
+ const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+ const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+ dst += pitch;
+ vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+ const uint8x8x2_t result,
+ const uint8x8x2_t result2) {
+ write_2x4(dst, pitch, result);
+ dst += pitch * 8;
+ write_2x4(dst, pitch, result2);
+}
+#endif
+
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ x = vld4_lane_u8(src, x, 0);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 1);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 2);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 3);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 4);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 5);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 6);
+ src += pitch;
+ x = vld4_lane_u8(src, x, 7);
+ return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+ const uint8x8_t a = vld1_u8(src);
+ const uint8x8_t b = vld1_u8(src + pitch * 1);
+ const uint8x8_t c = vld1_u8(src + pitch * 2);
+ const uint8x8_t d = vld1_u8(src + pitch * 3);
+ const uint8x8_t e = vld1_u8(src + pitch * 4);
+ const uint8x8_t f = vld1_u8(src + pitch * 5);
+ const uint8x8_t g = vld1_u8(src + pitch * 6);
+ const uint8x8_t h = vld1_u8(src + pitch * 7);
+ const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+ vreinterpret_u32_u8(e));
+ const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+ vreinterpret_u32_u8(f));
+ const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+ vreinterpret_u32_u8(g));
+ const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+ vreinterpret_u32_u8(h));
+ const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+ vreinterpret_u16_u32(r26_u32.val[0]));
+ const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+ vreinterpret_u16_u32(r37_u32.val[0]));
+ const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+ vreinterpret_u8_u16(r13_u16.val[0]));
+ const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+ vreinterpret_u8_u16(r13_u16.val[1]));
+ /*
+ * after vtrn_u32
+ 00 01 02 03 | 40 41 42 43
+ 10 11 12 13 | 50 51 52 53
+ 20 21 22 23 | 60 61 62 63
+ 30 31 32 33 | 70 71 72 73
+ ---
+ * after vtrn_u16
+ 00 01 20 21 | 40 41 60 61
+ 02 03 22 23 | 42 43 62 63
+ 10 11 30 31 | 50 51 70 71
+   12 13 32 33 | 52 53 72 73
+
+ 00 01 20 21 | 40 41 60 61
+ 10 11 30 31 | 50 51 70 71
+ 02 03 22 23 | 42 43 62 63
+   12 13 32 33 | 52 53 72 73
+ ---
+ * after vtrn_u8
+ 00 10 20 30 | 40 50 60 70
+ 01 11 21 31 | 41 51 61 71
+ 02 12 22 32 | 42 52 62 72
+ 03 13 23 33 | 43 53 63 73
+ */
+ x.val[0] = r01_u8.val[0];
+ x.val[1] = r01_u8.val[1];
+ x.val[2] = r23_u8.val[0];
+ x.val[3] = r23_u8.val[1];
+
+ return x;
+}
+#endif
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit) {
+ unsigned char *src1;
+ uint8x16_t qblimit, q0u8;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+ int16x8_t q2s16, q13s16, q11s16;
+ int8x8_t d28s8, d29s8;
+ int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+ uint8x8x4_t d0u8x4; // d6, d7, d8, d9
+ uint8x8x4_t d1u8x4; // d10, d11, d12, d13
+ uint8x8x2_t d2u8x2; // d12, d13
+ uint8x8x2_t d3u8x2; // d14, d15
+
+ qblimit = vdupq_n_u8(*blimit);
+
+ src1 = s - 2;
+ d0u8x4 = read_4x8(src1, p, d0u8x4);
+ src1 += p * 8;
+ d1u8x4 = read_4x8(src1, p, d1u8x4);
+
+ q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
+ q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
+ q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
+ q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
+
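+  /* Filter mask, as in the removed asm:
+   * abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit. */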
+ q15u8 = vabdq_u8(q5u8, q4u8);
+ q14u8 = vabdq_u8(q3u8, q6u8);
+
+ q15u8 = vqaddq_u8(q15u8, q15u8);
+ q14u8 = vshrq_n_u8(q14u8, 1);
+ q0u8 = vdupq_n_u8(0x80);
+ q11s16 = vdupq_n_s16(3);
+ q15u8 = vqaddq_u8(q15u8, q14u8);
+
+ q3u8 = veorq_u8(q3u8, q0u8);
+ q4u8 = veorq_u8(q4u8, q0u8);
+ q5u8 = veorq_u8(q5u8, q0u8);
+ q6u8 = veorq_u8(q6u8, q0u8);
+
+ q15u8 = vcgeq_u8(qblimit, q15u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_low_s8(vreinterpretq_s8_u8(q5u8)));
+ q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
+ vget_high_s8(vreinterpretq_s8_u8(q5u8)));
+
+ q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
+ vreinterpretq_s8_u8(q6u8));
+
+ q2s16 = vmulq_s16(q2s16, q11s16);
+ q13s16 = vmulq_s16(q13s16, q11s16);
+
+ q11u8 = vdupq_n_u8(3);
+ q12u8 = vdupq_n_u8(4);
+
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
+ q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
+
+ d28s8 = vqmovn_s16(q2s16);
+ d29s8 = vqmovn_s16(q13s16);
+ q14s8 = vcombine_s8(d28s8, d29s8);
+
+ q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
+
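+  /* Filter2 = clamp(filter + 3) >> 3 and Filter1 = clamp(filter + 4) >> 3,
+   * matching the comments in the removed asm. */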
+ q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
+ q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q14s8 = vshrq_n_s8(q3s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
+ q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
+
+ q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+ q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+ d2u8x2.val[0] = vget_low_u8(q6u8); // d12
+ d2u8x2.val[1] = vget_low_u8(q7u8); // d14
+ d3u8x2.val[0] = vget_high_u8(q6u8); // d13
+ d3u8x2.val[1] = vget_high_u8(q7u8); // d15
+
+ src1 = s - 1;
+ write_2x8(src1, p, d2u8x2, d3u8x2);
+}
+
+void vp8_loop_filter_bvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ y_ptr += 4;
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
+
+void vp8_loop_filter_mbvs_neon(
+ unsigned char *y_ptr,
+ int y_stride,
+ const unsigned char *blimit) {
+ vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
+ return;
+}
diff --git a/vp8/common/arm/neon/reconintra_neon.c b/vp8/common/arm/neon/reconintra_neon.c
new file mode 100644
index 0000000..af52cd5
--- /dev/null
+++ b/vp8/common/arm/neon/reconintra_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride) {
+ const int mode = x->mode_info_context->mbmi.mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x16_t v_expected_dc = vdupq_n_u8(128);
+
+ if (shift) {
+ unsigned int average = 0;
+ int expected_dc;
+ if (x->up_available) {
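+          /* Sum the 16 above pixels with a widening pairwise-add ladder. */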
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ const uint16x8_t a = vpaddlq_u8(v_above);
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ average = vget_lane_u32(d, 0);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 16; ++i) {
+ average += yleft[0];
+ yleft += left_stride;
+ }
+ }
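+        /* 16 above and/or 16 left pixels contribute, so the divisor becomes
+         * 16 (shift 4) or 32 (shift 5), with rounding to nearest. */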
+ shift += 3;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
+ }
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_expected_dc);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(ypred_ptr, v_above);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
+ yleft += left_stride;
+ vst1q_u8(ypred_ptr, v_yleft);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
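+        /* TM prediction: pred(r, c) = clamp(left[r] + above[c] - top_left). */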
+ const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
+ const uint8x16_t v_above = vld1q_u8(yabove_row);
+ for (i = 0; i < 16; ++i) {
+ const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
+ const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
+ const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
+ const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
+ vreinterpretq_s16_u16(v_ytop_left));
+ const uint8x8_t pred_lo = vqmovun_s16(b_lo);
+ const uint8x8_t pred_hi = vqmovun_s16(b_hi);
+
+ vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
+ ypred_ptr += y_stride;
+ yleft += left_stride;
+ }
+ }
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride) {
+ const int mode = x->mode_info_context->mbmi.uv_mode;
+ int i;
+
+ switch (mode) {
+ case DC_PRED:
+ {
+ int shift = x->up_available + x->left_available;
+ uint8x8_t v_expected_udc = vdup_n_u8(128);
+ uint8x8_t v_expected_vdc = vdup_n_u8(128);
+
+ if (shift) {
+ unsigned int average_u = 0;
+ unsigned int average_v = 0;
+ int expected_udc;
+ int expected_vdc;
+ if (x->up_available) {
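+          /* The u and v rows are summed together: after the pairwise
+           * widenings, lane 0 of c holds the u sum and lane 1 the v sum. */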
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
+ average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
+ }
+ if (x->left_available) {
+ for (i = 0; i < 8; ++i) {
+ average_u += uleft[0];
+ uleft += left_stride;
+ average_v += vleft[0];
+ vleft += left_stride;
+ }
+ }
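+        /* 8 above and/or 8 left pixels per plane, so the divisor becomes
+         * 8 (shift 3) or 16 (shift 4), with rounding to nearest. */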
+ shift += 2;
+ expected_udc = (average_u + (1 << (shift - 1))) >> shift;
+ expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
+ v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
+ v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
+ }
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_expected_udc);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_expected_vdc);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ vst1_u8(upred_ptr, v_uabove);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vabove);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
+ uleft += left_stride;
+ vleft += left_stride;
+ vst1_u8(upred_ptr, v_uleft);
+ upred_ptr += pred_stride;
+ vst1_u8(vpred_ptr, v_vleft);
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case TM_PRED:
+ {
+ const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
+ const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
+ const uint8x8_t v_uabove = vld1_u8(uabove_row);
+ const uint8x8_t v_vabove = vld1_u8(vabove_row);
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
+ const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
+ const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
+ const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
+ const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
+ vreinterpretq_s16_u16(v_utop_left));
+ const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
+ vreinterpretq_s16_u16(v_vtop_left));
+ const uint8x8_t pred_u = vqmovun_s16(b_u);
+ const uint8x8_t pred_v = vqmovun_s16(b_v);
+
+ vst1_u8(upred_ptr, pred_u);
+ vst1_u8(vpred_ptr, pred_v);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ uleft += left_stride;
+ vleft += left_stride;
+ }
+ }
+ break;
+ }
+}
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
deleted file mode 100644
index adc5b7e..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ /dev/null
@@ -1,425 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-;-----------------
-
- EXPORT |vp8_sub_pixel_variance16x16_neon_func|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
-
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
-|vp8_sub_pixel_variance16x16_neon_func| PROC
- push {r4-r6, lr}
- vpush {d8-d15}
-
- adr r12, bilinear_taps_coeff
- ldr r4, [sp, #80] ;load *dst_ptr from stack
- ldr r5, [sp, #84] ;load dst_pixels_per_line from stack
- ldr r6, [sp, #88] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16_only
-
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
-
- vld1.s32 {d31}, [r2] ;load first_pass filter
-
- beq firstpass_bfilter16x16_only
-
- sub sp, sp, #272 ;reserve space on stack for temporary storage
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- mov lr, sp
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- mov r2, #3 ;loop counter
- vld1.u8 {d8, d9, d10}, [r0], r1
-
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- vdup.8 d1, d31[4]
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16_loop_neon
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
-
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vqrshrn.u16 d21, q14, #7
- vld1.u8 {d5, d6, d7}, [r0], r1
-
- vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
- vld1.u8 {d8, d9, d10}, [r0], r1
- vst1.u8 {d18, d19, d20, d21}, [lr]!
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- bne vp8e_filt_blk2d_fp16x16_loop_neon
-
-;First-pass filtering for rest 5 lines
- vld1.u8 {d14, d15, d16}, [r0], r1
-
- vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q10, d3, d0
- vmull.u8 q11, d5, d0
- vmull.u8 q12, d6, d0
- vmull.u8 q13, d8, d0
- vmull.u8 q14, d9, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
-
- vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q11, d5, d1
- vmlal.u8 q13, d8, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
-
- vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q12, d6, d1
- vmlal.u8 q14, d9, d1
-
- vmull.u8 q1, d11, d0
- vmull.u8 q2, d12, d0
- vmull.u8 q3, d14, d0
- vmull.u8 q4, d15, d0
-
- vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
- vext.8 d14, d14, d15, #1
-
- vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q3, d14, d1
-
- vext.8 d12, d12, d13, #1
- vext.8 d15, d15, d16, #1
-
- vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q4, d15, d1
-
- vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d11, q10, #7
- vqrshrn.u16 d12, q11, #7
- vqrshrn.u16 d13, q12, #7
- vqrshrn.u16 d14, q13, #7
- vqrshrn.u16 d15, q14, #7
- vqrshrn.u16 d16, q1, #7
- vqrshrn.u16 d17, q2, #7
- vqrshrn.u16 d18, q3, #7
- vqrshrn.u16 d19, q4, #7
-
- vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
- vst1.u8 {d14, d15, d16, d17}, [lr]!
- vst1.u8 {d18, d19}, [lr]!
-
-;Second pass: 16x16
-;secondpass_filter
- add r3, r12, r3, lsl #3
- sub lr, lr, #272
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- sub sp, sp, #256
- mov r3, sp
-
- vld1.u8 {d22, d23}, [lr]! ;load src data
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
- mov r12, #4 ;loop counter
-
-vp8e_filt_blk2d_sp16x16_loop_neon
- vld1.u8 {d24, d25}, [lr]!
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [lr]!
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [lr]!
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [lr]!
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- subs r12, r12, #1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5}, [r3]!
- vst1.u8 {d6, d7}, [r3]!
- vmov q11, q15
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_sp16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;--------------------
-firstpass_bfilter16x16_only
- mov r2, #4 ;loop counter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vdup.8 d1, d31[4]
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16_loop_neon
- vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
- vld1.u8 {d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10}, [r0], r1
- vld1.u8 {d11, d12, d13}, [r0], r1
-
- pld [r0]
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-
- vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q8, d3, d0
- vmull.u8 q9, d5, d0
- vmull.u8 q10, d6, d0
- vmull.u8 q11, d8, d0
- vmull.u8 q12, d9, d0
- vmull.u8 q13, d11, d0
- vmull.u8 q14, d12, d0
-
- vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
- vext.8 d5, d5, d6, #1
- vext.8 d8, d8, d9, #1
- vext.8 d11, d11, d12, #1
-
- vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q9, d5, d1
- vmlal.u8 q11, d8, d1
- vmlal.u8 q13, d11, d1
-
- vext.8 d3, d3, d4, #1
- vext.8 d6, d6, d7, #1
- vext.8 d9, d9, d10, #1
- vext.8 d12, d12, d13, #1
-
- vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
- vmlal.u8 q10, d6, d1
- vmlal.u8 q12, d9, d1
- vmlal.u8 q14, d12, d1
-
- subs r2, r2, #1
-
- vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d15, q8, #7
- vqrshrn.u16 d16, q9, #7
- vqrshrn.u16 d17, q10, #7
- vqrshrn.u16 d18, q11, #7
- vqrshrn.u16 d19, q12, #7
- vqrshrn.u16 d20, q13, #7
- vst1.u8 {d14, d15}, [r3]! ;store result
- vqrshrn.u16 d21, q14, #7
-
- vst1.u8 {d16, d17}, [r3]!
- vst1.u8 {d18, d19}, [r3]!
- vst1.u8 {d20, d21}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;---------------------
-secondpass_bfilter16x16_only
-;Second pass: 16x16
-;secondpass_filter
- sub sp, sp, #528 ;reserve space on stack for temporary storage
- add r3, r12, r3, lsl #3
- mov r12, #4 ;loop counter
- vld1.u32 {d31}, [r3] ;load second_pass filter
- vld1.u8 {d22, d23}, [r0], r1 ;load src data
- mov r3, sp
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
-vp8e_filt_blk2d_spo16x16_loop_neon
- vld1.u8 {d24, d25}, [r0], r1
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vld1.u8 {d26, d27}, [r0], r1
- vmull.u8 q2, d23, d0
- vld1.u8 {d28, d29}, [r0], r1
- vmull.u8 q3, d24, d0
- vld1.u8 {d30, d31}, [r0], r1
-
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d25, d1
- vmlal.u8 q3, d26, d1
- vmlal.u8 q4, d27, d1
- vmlal.u8 q5, d28, d1
- vmlal.u8 q6, d29, d1
- vmlal.u8 q7, d30, d1
- vmlal.u8 q8, d31, d1
-
- vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d3, q2, #7
- vqrshrn.u16 d4, q3, #7
- vqrshrn.u16 d5, q4, #7
- vqrshrn.u16 d6, q5, #7
- vqrshrn.u16 d7, q6, #7
- vqrshrn.u16 d8, q7, #7
- vqrshrn.u16 d9, q8, #7
-
- vst1.u8 {d2, d3}, [r3]! ;store result
- subs r12, r12, #1
- vst1.u8 {d4, d5}, [r3]!
- vmov q11, q15
- vst1.u8 {d6, d7}, [r3]!
- vst1.u8 {d8, d9}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16_loop_neon
-
- b sub_pixel_variance16x16_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r12, #8
-
-sub_pixel_variance16x16_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q2}, [r4], r5
- vld1.8 {q1}, [r3]!
- vld1.8 {q3}, [r4], r5
-
- vsubl.u8 q11, d0, d4 ;diff
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne sub_pixel_variance16x16_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [r6] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- add sp, sp, #528
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4-r6,pc}
-
- ENDP
-
- END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
deleted file mode 100644
index b0829af..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ /dev/null
@@ -1,583 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_variance_halfpixvar16x16_h_neon|
- EXPORT |vp8_variance_halfpixvar16x16_v_neon|
- EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
- EXPORT |vp8_sub_pixel_variance16x16s_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_h_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_h_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- mov r12, #4 ;loop counter
- ldr lr, [sp, #68] ;load *sse from stack
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8_filt_fpo16x16s_4_0_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.8 {q11}, [r2], r3
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.8 {q12}, [r2], r3
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.8 {q13}, [r2], r3
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vext.8 q3, q2, q3, #1
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vld1.8 {q14}, [r2], r3
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
-
- vsubl.u8 q4, d0, d22 ;diff
- vsubl.u8 q5, d1, d23
- vsubl.u8 q6, d2, d24
- vsubl.u8 q7, d3, d25
- vsubl.u8 q0, d4, d26
- vsubl.u8 q1, d5, d27
- vsubl.u8 q2, d6, d28
- vsubl.u8 q3, d7, d29
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_fpo16x16s_4_0_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_v_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_v_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- mov r12, #4 ;loop counter
-
- vld1.u8 {q0}, [r0], r1 ;load src data
- ldr lr, [sp, #68] ;load *sse from stack
-
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
-vp8_filt_spo16x16s_0_4_loop_neon
- vld1.u8 {q2}, [r0], r1
- vld1.8 {q1}, [r2], r3
- vld1.u8 {q4}, [r0], r1
- vld1.8 {q3}, [r2], r3
- vld1.u8 {q6}, [r0], r1
- vld1.8 {q5}, [r2], r3
- vld1.u8 {q15}, [r0], r1
-
- vrhadd.u8 q0, q0, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q4
- vrhadd.u8 q4, q4, q6
- vrhadd.u8 q6, q6, q15
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
-
- vmov q0, q15
-
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne vp8_filt_spo16x16s_0_4_loop_neon
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;================================================
-;unsigned int vp8_variance_halfpixvar16x16_hv_neon
-;(
-; unsigned char *src_ptr, r0
-; int src_pixels_per_line, r1
-; unsigned char *dst_ptr, r2
-; int dst_pixels_per_line, r3
-; unsigned int *sse
-;);
-;================================================
-|vp8_variance_halfpixvar16x16_hv_neon| PROC
- push {lr}
- vpush {d8-d15}
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
-
- ldr lr, [sp, #68] ;load *sse from stack
- vmov.i8 q13, #0 ;q8 - sum
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
-
- vmov.i8 q14, #0 ;q9, q10 - sse
- vmov.i8 q15, #0
-
- mov r12, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8_filt16x16s_4_4_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vld1.8 {q5}, [r2], r3
- vrhadd.u8 q0, q0, q1
- vld1.8 {q6}, [r2], r3
- vrhadd.u8 q1, q1, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q3
- vld1.8 {q8}, [r2], r3
- vrhadd.u8 q3, q3, q4
-
- vsubl.u8 q9, d0, d10 ;diff
- vsubl.u8 q10, d1, d11
- vsubl.u8 q11, d2, d12
- vsubl.u8 q12, d3, d13
-
- vsubl.u8 q0, d4, d14 ;diff
- vsubl.u8 q1, d5, d15
- vsubl.u8 q5, d6, d16
- vsubl.u8 q6, d7, d17
-
- vpadal.s16 q13, q9 ;sum
- vmlal.s16 q14, d18, d18 ;sse
- vmlal.s16 q15, d19, d19
-
- vpadal.s16 q13, q10 ;sum
- vmlal.s16 q14, d20, d20 ;sse
- vmlal.s16 q15, d21, d21
-
- vpadal.s16 q13, q11 ;sum
- vmlal.s16 q14, d22, d22 ;sse
- vmlal.s16 q15, d23, d23
-
- vpadal.s16 q13, q12 ;sum
- vmlal.s16 q14, d24, d24 ;sse
- vmlal.s16 q15, d25, d25
-
- subs r12, r12, #1
-
- vpadal.s16 q13, q0 ;sum
- vmlal.s16 q14, d0, d0 ;sse
- vmlal.s16 q15, d1, d1
-
- vpadal.s16 q13, q1 ;sum
- vmlal.s16 q14, d2, d2 ;sse
- vmlal.s16 q15, d3, d3
-
- vpadal.s16 q13, q5 ;sum
- vmlal.s16 q14, d10, d10 ;sse
- vmlal.s16 q15, d11, d11
-
- vmov q0, q4
-
- vpadal.s16 q13, q6 ;sum
- vmlal.s16 q14, d12, d12 ;sse
- vmlal.s16 q15, d13, d13
-
- bne vp8_filt16x16s_4_4_loop_neon
-
- vadd.u32 q15, q14, q15 ;accumulate sse
- vpaddl.s32 q0, q13 ;accumulate sum
-
- vpaddl.u32 q1, q15
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {pc}
- ENDP
-
-;==============================
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step()
-;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter,
-;or filter coeff is {64, 64}. This simplified program only works in this situation.
-;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later.
-
-|vp8_sub_pixel_variance16x16s_neon| PROC
- push {r4, lr}
- vpush {d8-d15}
-
- ldr r4, [sp, #72] ;load *dst_ptr from stack
- ldr r12, [sp, #76] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #80] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq secondpass_bfilter16x16s_only
-
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- beq firstpass_bfilter16x16s_only
-
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- mov r3, sp
- mov r2, #4 ;loop counter
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
-
-;First Pass: output_height lines x output_width columns (17x16)
-vp8e_filt_blk2d_fp16x16s_loop_neon
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q1, q1, q2
- vrhadd.u8 q2, q2, q3
- vrhadd.u8 q3, q3, q4
-
- subs r2, r2, #1
- vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
- vmov q0, q4
- vst1.u8 {d4, d5, d6, d7}, [r3]!
-
- bne vp8e_filt_blk2d_fp16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;--------------------
-firstpass_bfilter16x16s_only
- mov r2, #2 ;loop counter
- sub sp, sp, #256 ;reserve space on stack for temporary storage
- mov r3, sp
-
-;First Pass: output_height lines x output_width columns (16x16)
-vp8e_filt_blk2d_fpo16x16s_loop_neon
- vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
- vld1.u8 {d4, d5, d6, d7}, [r0], r1
- vld1.u8 {d8, d9, d10, d11}, [r0], r1
- vld1.u8 {d12, d13, d14, d15}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
- vld1.u8 {d16, d17, d18, d19}, [r0], r1
- vext.8 q3, q2, q3, #1
- vld1.u8 {d20, d21, d22, d23}, [r0], r1
- vext.8 q5, q4, q5, #1
- vld1.u8 {d24, d25, d26, d27}, [r0], r1
- vext.8 q7, q6, q7, #1
- vld1.u8 {d28, d29, d30, d31}, [r0], r1
- vext.8 q9, q8, q9, #1
- vext.8 q11, q10, q11, #1
- vext.8 q13, q12, q13, #1
- vext.8 q15, q14, q15, #1
-
- vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q1, q2, q3
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
- vrhadd.u8 q5, q10, q11
- vrhadd.u8 q6, q12, q13
- vrhadd.u8 q7, q14, q15
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]!
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_fpo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;---------------------
-secondpass_bfilter16x16s_only
- sub sp, sp, #256 ;reserve space on stack for temporary storage
-
- mov r2, #2 ;loop counter
- vld1.u8 {d0, d1}, [r0], r1 ;load src data
- mov r3, sp
-
-vp8e_filt_blk2d_spo16x16s_loop_neon
- vld1.u8 {d2, d3}, [r0], r1
- vld1.u8 {d4, d5}, [r0], r1
- vld1.u8 {d6, d7}, [r0], r1
- vld1.u8 {d8, d9}, [r0], r1
-
- vrhadd.u8 q0, q0, q1
- vld1.u8 {d10, d11}, [r0], r1
- vrhadd.u8 q1, q1, q2
- vld1.u8 {d12, d13}, [r0], r1
- vrhadd.u8 q2, q2, q3
- vld1.u8 {d14, d15}, [r0], r1
- vrhadd.u8 q3, q3, q4
- vld1.u8 {d16, d17}, [r0], r1
- vrhadd.u8 q4, q4, q5
- vrhadd.u8 q5, q5, q6
- vrhadd.u8 q6, q6, q7
- vrhadd.u8 q7, q7, q8
-
- subs r2, r2, #1
-
- vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
- vmov q0, q8
- vst1.u8 {d4, d5, d6, d7}, [r3]!
- vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
- vst1.u8 {d12, d13, d14, d15}, [r3]!
-
- bne vp8e_filt_blk2d_spo16x16s_loop_neon
-
- b sub_pixel_variance16x16s_neon
-
-;----------------------------
-;variance16x16
-sub_pixel_variance16x16s_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- sub r3, r3, #256
- mov r2, #4
-
-sub_pixel_variance16x16s_neon_loop
- vld1.8 {q0}, [r3]! ;Load up source and reference
- vld1.8 {q1}, [r4], r12
- vld1.8 {q2}, [r3]!
- vld1.8 {q3}, [r4], r12
- vld1.8 {q4}, [r3]!
- vld1.8 {q5}, [r4], r12
- vld1.8 {q6}, [r3]!
- vld1.8 {q7}, [r4], r12
-
- vsubl.u8 q11, d0, d2 ;diff
- vsubl.u8 q12, d1, d3
- vsubl.u8 q13, d4, d6
- vsubl.u8 q14, d5, d7
- vsubl.u8 q0, d8, d10
- vsubl.u8 q1, d9, d11
- vsubl.u8 q2, d12, d14
- vsubl.u8 q3, d13, d15
-
- vpadal.s16 q8, q11 ;sum
- vmlal.s16 q9, d22, d22 ;sse
- vmlal.s16 q10, d23, d23
-
- subs r2, r2, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- vpadal.s16 q8, q0 ;sum
- vmlal.s16 q9, d0, d0 ;sse
- vmlal.s16 q10, d1, d1
- vpadal.s16 q8, q1
- vmlal.s16 q9, d2, d2
- vmlal.s16 q10, d3, d3
- vpadal.s16 q8, q2
- vmlal.s16 q9, d4, d4
- vmlal.s16 q10, d5, d5
- vpadal.s16 q8, q3
- vmlal.s16 q9, d6, d6
- vmlal.s16 q10, d7, d7
-
- bne sub_pixel_variance16x16s_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- add sp, sp, #256
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4, pc}
- ENDP
-
- END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
deleted file mode 100644
index 9d9f9e0..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_sub_pixel_variance8x8_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack(r4) unsigned char *dst_ptr,
-; stack(r5) int dst_pixels_per_line,
-; stack(r6) unsigned int *sse
-;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
-
-|vp8_sub_pixel_variance8x8_neon| PROC
- push {r4-r5, lr}
- vpush {d8-d15}
-
- adr r12, bilinear_taps_coeff
- ldr r4, [sp, #76] ;load *dst_ptr from stack
- ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
- ldr lr, [sp, #84] ;load *sse from stack
-
- cmp r2, #0 ;skip first_pass filter if xoffset=0
- beq skip_firstpass_filter
-
-;First pass: output_height lines x output_width columns (9x8)
- add r2, r12, r2, lsl #3 ;calculate filter location
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vld1.u32 {d31}, [r2] ;load first_pass filter
- vld1.u8 {q2}, [r0], r1
- vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
- vld1.u8 {q3}, [r0], r1
- vdup.8 d1, d31[4]
- vld1.u8 {q4}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
-
- vld1.u8 {q1}, [r0], r1 ;load src data
- vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
- vld1.u8 {q2}, [r0], r1
- vqrshrn.u16 d23, q7, #7
- vld1.u8 {q3}, [r0], r1
- vqrshrn.u16 d24, q8, #7
- vld1.u8 {q4}, [r0], r1
- vqrshrn.u16 d25, q9, #7
-
- ;first_pass filtering on the rest 5-line data
- vld1.u8 {q5}, [r0], r1
-
- vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q7, d4, d0
- vmull.u8 q8, d6, d0
- vmull.u8 q9, d8, d0
- vmull.u8 q10, d10, d0
-
- vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vext.8 d9, d8, d9, #1
- vext.8 d11, d10, d11, #1
-
- vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
- vmlal.u8 q7, d5, d1
- vmlal.u8 q8, d7, d1
- vmlal.u8 q9, d9, d1
- vmlal.u8 q10, d11, d1
-
- vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d27, q7, #7
- vqrshrn.u16 d28, q8, #7
- vqrshrn.u16 d29, q9, #7
- vqrshrn.u16 d30, q10, #7
-
-;Second pass: 8x8
-secondpass_filter
- cmp r3, #0 ;skip second_pass filter if yoffset=0
- ;skip_secondpass_filter
- beq sub_pixel_variance8x8_neon
-
- add r3, r12, r3, lsl #3
-
- vld1.u32 {d31}, [r3] ;load second_pass filter
-
- vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
- vdup.8 d1, d31[4]
-
- vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
- vmull.u8 q2, d23, d0
- vmull.u8 q3, d24, d0
- vmull.u8 q4, d25, d0
- vmull.u8 q5, d26, d0
- vmull.u8 q6, d27, d0
- vmull.u8 q7, d28, d0
- vmull.u8 q8, d29, d0
-
- vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
- vmlal.u8 q2, d24, d1
- vmlal.u8 q3, d25, d1
- vmlal.u8 q4, d26, d1
- vmlal.u8 q5, d27, d1
- vmlal.u8 q6, d28, d1
- vmlal.u8 q7, d29, d1
- vmlal.u8 q8, d30, d1
-
- vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
- vqrshrn.u16 d23, q2, #7
- vqrshrn.u16 d24, q3, #7
- vqrshrn.u16 d25, q4, #7
- vqrshrn.u16 d26, q5, #7
- vqrshrn.u16 d27, q6, #7
- vqrshrn.u16 d28, q7, #7
- vqrshrn.u16 d29, q8, #7
-
- b sub_pixel_variance8x8_neon
-
-;--------------------
-skip_firstpass_filter
- vld1.u8 {d22}, [r0], r1 ;load src data
- vld1.u8 {d23}, [r0], r1
- vld1.u8 {d24}, [r0], r1
- vld1.u8 {d25}, [r0], r1
- vld1.u8 {d26}, [r0], r1
- vld1.u8 {d27}, [r0], r1
- vld1.u8 {d28}, [r0], r1
- vld1.u8 {d29}, [r0], r1
- vld1.u8 {d30}, [r0], r1
-
- b secondpass_filter
-
-;----------------------
-;vp8_variance8x8_neon
-sub_pixel_variance8x8_neon
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - sse
- vmov.i8 q10, #0
-
- mov r12, #2
-
-sub_pixel_variance8x8_neon_loop
- vld1.8 {d0}, [r4], r5 ;load dst data
- subs r12, r12, #1
- vld1.8 {d1}, [r4], r5
- vld1.8 {d2}, [r4], r5
- vsubl.u8 q4, d22, d0 ;calculate diff
- vld1.8 {d3}, [r4], r5
-
- vsubl.u8 q5, d23, d1
- vsubl.u8 q6, d24, d2
-
- vpadal.s16 q8, q4 ;sum
- vmlal.s16 q9, d8, d8 ;sse
- vmlal.s16 q10, d9, d9
-
- vsubl.u8 q7, d25, d3
-
- vpadal.s16 q8, q5
- vmlal.s16 q9, d10, d10
- vmlal.s16 q10, d11, d11
-
- vmov q11, q13
-
- vpadal.s16 q8, q6
- vmlal.s16 q9, d12, d12
- vmlal.s16 q10, d13, d13
-
- vmov q12, q14
-
- vpadal.s16 q8, q7
- vmlal.s16 q9, d14, d14
- vmlal.s16 q10, d15, d15
-
- bne sub_pixel_variance8x8_neon_loop
-
- vadd.u32 q10, q9, q10 ;accumulate sse
- vpaddl.s32 q0, q8 ;accumulate sum
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #6
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
-
- vpop {d8-d15}
- pop {r4-r5, pc}
-
- ENDP
-
-;-----------------
-
-bilinear_taps_coeff
- DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
-
- END
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
new file mode 100644
index 0000000..6405bf2
--- /dev/null
+++ b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -0,0 +1,1028 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
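+/* MSVC has no __builtin_prefetch; compile the hint away. */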
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+static const uint16_t bilinear_taps_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
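+
+// Two-tap bilinear filter weights, indexed by the 1/8-pel subpixel offset.
+// Each pair sums to 128, so a filtered pixel is
+//   (src[0] * f0 + src[1] * f1 + 64) >> 7
+// which the code below computes with vmull/vmlal plus a rounding
+// narrowing shift by 7.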
+
+unsigned int vp8_sub_pixel_variance16x16_neon_func(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int i;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+ unsigned char *tmpp;
+ unsigned char *tmpp2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ tmpp2 = tmp + 272;
+ tmpp = tmp;
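+  // tmp holds the 17x16 first-pass output (272 bytes); the 16x16
+  // second-pass result is written at tmp + 272 (272 + 256 = 528 bytes).
+  // Three cases follow: xoffset == 0 needs only the vertical filter,
+  // yoffset == 0 only the horizontal one; otherwise both passes run.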
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q13u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q14u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
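+      // carry the last loaded row into the next iteration: the vertical
+      // filter consumes 17 input rows to produce 16 output rows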
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ } else if (yoffset == 0) { // firstpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+    for (i = 4; i > 0; i--) {
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ __builtin_prefetch(src_ptr);
+ __builtin_prefetch(src_ptr + src_pixels_per_line);
+ __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
+
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp2, q7u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q8u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q9u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q10u8);
+ tmpp2 += 16;
+ }
+ } else {
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ // First Pass: output_height lines x output_width columns (17x16)
+ for (i = 3; i > 0; i--) {
+ q7u16 = vmull_u8(d2u8, d0u8);
+ q8u16 = vmull_u8(d3u8, d0u8);
+ q9u16 = vmull_u8(d5u8, d0u8);
+ q10u16 = vmull_u8(d6u8, d0u8);
+ q11u16 = vmull_u8(d8u8, d0u8);
+ q12u16 = vmull_u8(d9u8, d0u8);
+ q13u16 = vmull_u8(d11u8, d0u8);
+ q14u16 = vmull_u8(d12u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+
+ q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
+ q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+
+ q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
+ q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
+
+ d14u8 = vqrshrn_n_u16(q7u16, 7);
+ d15u8 = vqrshrn_n_u16(q8u16, 7);
+ d16u8 = vqrshrn_n_u16(q9u16, 7);
+ d17u8 = vqrshrn_n_u16(q10u16, 7);
+ d18u8 = vqrshrn_n_u16(q11u16, 7);
+ d19u8 = vqrshrn_n_u16(q12u16, 7);
+ d20u8 = vqrshrn_n_u16(q13u16, 7);
+ d21u8 = vqrshrn_n_u16(q14u16, 7);
+
+ d2u8 = vld1_u8(src_ptr);
+ d3u8 = vld1_u8(src_ptr + 8);
+ d4u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d5u8 = vld1_u8(src_ptr);
+ d6u8 = vld1_u8(src_ptr + 8);
+ d7u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d8u8 = vld1_u8(src_ptr);
+ d9u8 = vld1_u8(src_ptr + 8);
+ d10u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+ d11u8 = vld1_u8(src_ptr);
+ d12u8 = vld1_u8(src_ptr + 8);
+ d13u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+ q10u8 = vcombine_u8(d20u8, d21u8);
+
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q10u8);
+ tmpp += 16;
+ }
+
+    // First-pass filtering for the remaining 5 lines
+ d14u8 = vld1_u8(src_ptr);
+ d15u8 = vld1_u8(src_ptr + 8);
+ d16u8 = vld1_u8(src_ptr + 16);
+ src_ptr += src_pixels_per_line;
+
+ q9u16 = vmull_u8(d2u8, d0u8);
+ q10u16 = vmull_u8(d3u8, d0u8);
+ q11u16 = vmull_u8(d5u8, d0u8);
+ q12u16 = vmull_u8(d6u8, d0u8);
+ q13u16 = vmull_u8(d8u8, d0u8);
+ q14u16 = vmull_u8(d9u8, d0u8);
+
+ d2u8 = vext_u8(d2u8, d3u8, 1);
+ d5u8 = vext_u8(d5u8, d6u8, 1);
+ d8u8 = vext_u8(d8u8, d9u8, 1);
+
+ q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
+ q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
+ q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
+
+ d3u8 = vext_u8(d3u8, d4u8, 1);
+ d6u8 = vext_u8(d6u8, d7u8, 1);
+ d9u8 = vext_u8(d9u8, d10u8, 1);
+
+ q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
+ q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
+ q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
+
+ q1u16 = vmull_u8(d11u8, d0u8);
+ q2u16 = vmull_u8(d12u8, d0u8);
+ q3u16 = vmull_u8(d14u8, d0u8);
+ q4u16 = vmull_u8(d15u8, d0u8);
+
+ d11u8 = vext_u8(d11u8, d12u8, 1);
+ d14u8 = vext_u8(d14u8, d15u8, 1);
+
+ q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
+ q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
+
+ d12u8 = vext_u8(d12u8, d13u8, 1);
+ d15u8 = vext_u8(d15u8, d16u8, 1);
+
+ q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
+ q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
+
+ d10u8 = vqrshrn_n_u16(q9u16, 7);
+ d11u8 = vqrshrn_n_u16(q10u16, 7);
+ d12u8 = vqrshrn_n_u16(q11u16, 7);
+ d13u8 = vqrshrn_n_u16(q12u16, 7);
+ d14u8 = vqrshrn_n_u16(q13u16, 7);
+ d15u8 = vqrshrn_n_u16(q14u16, 7);
+ d16u8 = vqrshrn_n_u16(q1u16, 7);
+ d17u8 = vqrshrn_n_u16(q2u16, 7);
+ d18u8 = vqrshrn_n_u16(q3u16, 7);
+ d19u8 = vqrshrn_n_u16(q4u16, 7);
+
+ q5u8 = vcombine_u8(d10u8, d11u8);
+ q6u8 = vcombine_u8(d12u8, d13u8);
+ q7u8 = vcombine_u8(d14u8, d15u8);
+ q8u8 = vcombine_u8(d16u8, d17u8);
+ q9u8 = vcombine_u8(d18u8, d19u8);
+
+ vst1q_u8((uint8_t *)tmpp, q5u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q6u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q7u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q8u8);
+ tmpp += 16;
+ vst1q_u8((uint8_t *)tmpp, q9u8);
+
+ // secondpass_filter
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ tmpp = tmp;
+ tmpp2 = tmpp + 272;
+ q11u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ for (i = 4; i > 0; i--) {
+ q12u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q13u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q14u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q15u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+
+ q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
+ q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
+ q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
+ q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
+ q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
+ q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
+ q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
+ q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
+
+ q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
+ q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
+ q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
+ q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
+ q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
+ q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
+ q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
+ q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
+
+ d2u8 = vqrshrn_n_u16(q1u16, 7);
+ d3u8 = vqrshrn_n_u16(q2u16, 7);
+ d4u8 = vqrshrn_n_u16(q3u16, 7);
+ d5u8 = vqrshrn_n_u16(q4u16, 7);
+ d6u8 = vqrshrn_n_u16(q5u16, 7);
+ d7u8 = vqrshrn_n_u16(q6u16, 7);
+ d8u8 = vqrshrn_n_u16(q7u16, 7);
+ d9u8 = vqrshrn_n_u16(q8u16, 7);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q2u8 = vcombine_u8(d4u8, d5u8);
+ q3u8 = vcombine_u8(d6u8, d7u8);
+ q4u8 = vcombine_u8(d8u8, d9u8);
+
+ q11u8 = q15u8;
+
+ vst1q_u8((uint8_t *)tmpp2, q1u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q2u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q3u8);
+ tmpp2 += 16;
+ vst1q_u8((uint8_t *)tmpp2, q4u8);
+ tmpp2 += 16;
+ }
+ }
+
+ // sub_pixel_variance16x16_neon
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ tmpp = tmp + 272;
+ for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop
+ q0u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q1u8 = vld1q_u8(tmpp);
+ tmpp += 16;
+ q2u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+ q3u8 = vld1q_u8(dst_ptr);
+ dst_ptr += dst_pixels_per_line;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
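+  // variance = sse - sum * sum / (16 * 16); the shift by 8 divides by 256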
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_h_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
+ uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q11u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q12u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q13u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q14u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+
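+    // vext shifts each row left by one pixel; vrhadd then averages
+    // src[i] and src[i + 1], the horizontal half-pel interpolation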
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
+ q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
+ q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
+ q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
+ q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
+ q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
+
+ d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
+ d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
+ q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
+ q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
+ q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
+ q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
+ q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
+ q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
+ d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
+ d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
+ q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
+ q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_v_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
+ uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+  for (i = 0; i < 4; i++) {  // 4 iterations, 4 rows each
+ q2u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q15u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+
+ q1u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
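+    // average each row with the row below it: the vertical half-pel
+    // interpolation (q15u8 is carried into q0u8 for the next iteration)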
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q4u8);
+ q4u8 = vrhaddq_u8(q4u8, q6u8);
+ q6u8 = vrhaddq_u8(q6u8, q15u8);
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d8u8 = vget_low_u8(q4u8);
+ d9u8 = vget_high_u8(q4u8);
+ d12u8 = vget_low_u8(q6u8);
+ d13u8 = vget_high_u8(q6u8);
+
+ q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
+ q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
+ q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
+ q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8));
+ q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8));
+ q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8));
+ q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
+ q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
+ q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
+ q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
+ q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
+ d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
+ d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
+ q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
+ q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
+ d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
+ d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
+ q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
+
+ q0u8 = q15u8;
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
+ int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
+ int32x4_t q13s32, q14s32, q15s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q13s32 = vdupq_n_s32(0);
+ q14s32 = vdupq_n_s32(0);
+ q15s32 = vdupq_n_s32(0);
+
+ q0u8 = vld1q_u8(src_ptr);
+ q1u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q1u8 = vextq_u8(q0u8, q1u8, 1);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+  for (i = 0; i < 4; i++) {  // 4 iterations, 4 rows each
+ q2u8 = vld1q_u8(src_ptr);
+ q3u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q4u8 = vld1q_u8(src_ptr);
+ q5u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q6u8 = vld1q_u8(src_ptr);
+ q7u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+ q8u8 = vld1q_u8(src_ptr);
+ q9u8 = vld1q_u8(src_ptr + 16);
+ src_ptr += source_stride;
+
+ q3u8 = vextq_u8(q2u8, q3u8, 1);
+ q5u8 = vextq_u8(q4u8, q5u8, 1);
+ q7u8 = vextq_u8(q6u8, q7u8, 1);
+ q9u8 = vextq_u8(q8u8, q9u8, 1);
+
+ q1u8 = vrhaddq_u8(q2u8, q3u8);
+ q2u8 = vrhaddq_u8(q4u8, q5u8);
+ q3u8 = vrhaddq_u8(q6u8, q7u8);
+ q4u8 = vrhaddq_u8(q8u8, q9u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+ q1u8 = vrhaddq_u8(q1u8, q2u8);
+ q2u8 = vrhaddq_u8(q2u8, q3u8);
+ q3u8 = vrhaddq_u8(q3u8, q4u8);
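+    // each row was first averaged horizontally (vext + vrhadd); averaging
+    // consecutive filtered rows completes the diagonal half-pel sample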
+
+ q5u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q6u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q7u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q8u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ d0u8 = vget_low_u8(q0u8);
+ d1u8 = vget_high_u8(q0u8);
+ d2u8 = vget_low_u8(q1u8);
+ d3u8 = vget_high_u8(q1u8);
+ d4u8 = vget_low_u8(q2u8);
+ d5u8 = vget_high_u8(q2u8);
+ d6u8 = vget_low_u8(q3u8);
+ d7u8 = vget_high_u8(q3u8);
+
+ q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8));
+ q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
+ q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
+ q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
+ q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8));
+ q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8));
+ q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8));
+ q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
+ q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
+ q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
+ q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
+ q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
+ q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
+ q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
+ q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
+
+ d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
+ d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
+ q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
+ q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
+
+ d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
+ d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
+ q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
+
+ d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
+ d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
+ q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
+ q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
+
+ d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
+ d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
+ q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
+ q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
+ q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
+
+ q0u8 = q4u8;
+ }
+
+ q15s32 = vaddq_s32(q14s32, q15s32);
+ q0s64 = vpaddlq_s32(q13s32);
+ q1s64 = vpaddlq_s32(q15s32);
+
+ d0s64 = vget_low_s64(q0s64);
+ d1s64 = vget_high_s64(q0s64);
+ d2s64 = vget_low_s64(q1s64);
+ d3s64 = vget_high_s64(q1s64);
+ d0s64 = vadd_s64(d0s64, d1s64);
+ d1s64 = vadd_s64(d2s64, d3s64);
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+ vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+enum { kWidth8 = 8 };
+enum { kHeight8 = 8 };
+enum { kHeight8PlusOne = 9 };
+enum { kPixelStepOne = 1 };
+enum { kAlign16 = 16 };
+
+#define FILTER_BITS 7
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+}
+
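+// Applies the two-tap bilinear filter to an 8-pixel-wide block. pixel_step
+// selects the direction: 1 filters horizontally, the row stride filters
+// vertically (see the calls in vp8_sub_pixel_variance8x8_neon below).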
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint16_t *vpx_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]);
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp8_sub_pixel_variance8x8_neon(
+ const unsigned char *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst,
+ int dst_stride,
+ unsigned int *sse) {
+  DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8PlusOne * kWidth8);
+ DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ if (xoffset == 0) {
+    var_filter_block2d_bil_w8(src, temp2, src_stride, src_stride, kHeight8,
+                              kWidth8, bilinear_taps_coeff[yoffset]);
+ } else if (yoffset == 0) {
+ var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
+ kHeight8PlusOne, kWidth8,
+ bilinear_taps_coeff[xoffset]);
+ } else {
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
+ kHeight8PlusOne, kWidth8,
+ bilinear_taps_coeff[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
+ kWidth8, bilinear_taps_coeff[yoffset]);
+ }
+ return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+}
+
diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
deleted file mode 100644
index e55a33c..0000000
--- a/vp8/common/arm/reconintra_arm.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-#if HAVE_NEON_ASM
-extern void vp8_build_intra_predictors_mby_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
-{
- unsigned char *y_buffer = x->dst.y_buffer;
- unsigned char *ypred_ptr = x->predictor;
- int y_stride = x->dst.y_stride;
- int mode = x->mode_info_context->mbmi.mode;
- int Up = x->up_available;
- int Left = x->left_available;
-
- vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-extern void vp8_build_intra_predictors_mby_s_neon_func(
- unsigned char *y_buffer,
- unsigned char *ypred_ptr,
- int y_stride,
- int mode,
- int Up,
- int Left);
-
-void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
-{
- unsigned char *y_buffer = x->dst.y_buffer;
- unsigned char *ypred_ptr = x->predictor;
- int y_stride = x->dst.y_stride;
- int mode = x->mode_info_context->mbmi.mode;
- int Up = x->up_available;
- int Left = x->left_available;
-
- vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
-}
-
-#endif
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index e3f7083..467a509 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -95,7 +95,7 @@
#endif /* HAVE_MEDIA */
-#if HAVE_NEON_ASM
+#if HAVE_NEON
extern unsigned int vp8_sub_pixel_variance16x16_neon_func
(
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index a46fbfb..d48c4fe 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,7 +108,8 @@
* For temporal denoiser: noise_sensitivity = 0 means off,
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
* noise_sensitivity = 2 means temporal denoiser on for all channels.
- * noise_sensitivity >= 3 means aggressive denoising mode.
+ * noise_sensitivity = 3 means aggressive denoising mode.
+ * noise_sensitivity >= 4 means adaptive denoising mode.
* Temporal denoiser is enabled via the configuration option:
* CONFIG_TEMPORAL_DENOISING.
* For spatial denoiser: noise_sensitivity controls the amount of
@@ -223,7 +224,7 @@
int arnr_strength;
int arnr_type;
- struct vpx_fixed_buf two_pass_stats_in;
+ vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index fd9afd2..0070c28 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -38,15 +38,13 @@
$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
-$vp8_dequant_idct_add_y_block_neon_asm=vp8_dequant_idct_add_y_block_neon;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
-$vp8_dequant_idct_add_uv_block_neon_asm=vp8_dequant_idct_add_uv_block_neon;
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
#
@@ -58,9 +56,8 @@
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
-$vp8_loop_filter_bv_neon_asm=vp8_loop_filter_bv_neon;
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
@@ -69,19 +66,18 @@
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh mmx sse2 media neon_asm dspr2/;
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
-$vp8_loop_filter_bh_neon_asm=vp8_loop_filter_bh_neon;
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
-$vp8_loop_filter_simple_mbv_neon_asm=vp8_loop_filter_mbvs_neon;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
@@ -92,12 +88,12 @@
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
-specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon_asm/;
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
-$vp8_loop_filter_simple_bv_neon_asm=vp8_loop_filter_bvs_neon;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
@@ -153,11 +149,10 @@
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
-specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
-#TODO: fix assembly for neon
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3 neon/;
add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
-specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3 neon/;
add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
specialize qw/vp8_intra4x4_predict media/;
@@ -294,22 +289,19 @@
$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-$vp8_variance_halfpixvar16x16_h_neon_asm=vp8_variance_halfpixvar16x16_h_neon;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-$vp8_variance_halfpixvar16x16_v_neon_asm=vp8_variance_halfpixvar16x16_v_neon;
add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon_asm/;
+specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-$vp8_variance_halfpixvar16x16_hv_neon_asm=vp8_variance_halfpixvar16x16_hv_neon;
#
# Single block SAD
@@ -446,19 +438,16 @@
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
-$vp8_short_fdct4x4_neon_asm=vp8_short_fdct4x4_neon;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon_asm/;
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
-$vp8_short_fdct8x4_neon_asm=vp8_short_fdct8x4_neon;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 media neon_asm/;
+specialize qw/vp8_short_walsh4x4 sse2 media neon/;
$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
-$vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
#
# Quantizer
@@ -503,19 +492,16 @@
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_b mmx sse2 media neon/;
$vp8_subtract_b_media=vp8_subtract_b_armv6;
-$vp8_subtract_b_neon_asm=vp8_subtract_b_neon;
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mby mmx sse2 media neon/;
$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
-$vp8_subtract_mby_neon_asm=vp8_subtract_mby_neon;
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon_asm/;
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
-$vp8_subtract_mbuv_neon_asm=vp8_subtract_mbuv_neon;
#
# Motion search
@@ -541,13 +527,6 @@
}
#
-# Pick Loopfilter
-#
-add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_partial_frame neon_asm/;
-$vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
-
-#
# Denoiser filter
#
if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
@@ -555,7 +534,6 @@
specialize qw/vp8_denoiser_filter sse2 neon/;
add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
specialize qw/vp8_denoiser_filter_uv sse2 neon/;
-
}
# End of encoder only functions
diff --git a/vp8/encoder/arm/neon/picklpf_arm.c b/vp8/encoder/arm/neon/picklpf_arm.c
deleted file mode 100644
index ec8071e..0000000
--- a/vp8/encoder/arm/neon/picklpf_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/loopfilter.h"
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
- unsigned char *src_ptr,
- int sz);
-
-
-void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc)
-{
- unsigned char *src_y, *dst_y;
- int yheight;
- int ystride;
- int yoffset;
- int linestocopy;
-
- yheight = src_ybc->y_height;
- ystride = src_ybc->y_stride;
-
- /* number of MB rows to use in partial filtering */
- linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
- linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
-
- /* Copy extra 4 so that full filter context is available if filtering done
- * on the copied partial frame and not original. Partial filter does mb
- * filtering for the top row also, which can modify 3 pixels above.
- */
- linestocopy += 4;
- /* partial image starts at ~middle of frame (macroblock border) */
- yoffset = ystride * (((yheight >> 5) * 16) - 4);
- src_y = src_ybc->y_buffer + yoffset;
- dst_y = dst_ybc->y_buffer + yoffset;
-
- vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
-}
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
deleted file mode 100644
index 5ea8dd8..0000000
--- a/vp8/encoder/arm/neon/shortfdct_neon.asm
+++ /dev/null
@@ -1,221 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_fdct4x4_neon|
- EXPORT |vp8_short_fdct8x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-
- ALIGN 16 ; enable use of @128 bit aligned loads
-coeff
- DCW 5352, 5352, 5352, 5352
- DCW 2217, 2217, 2217, 2217
- DCD 14500, 14500, 14500, 14500
- DCD 7500, 7500, 7500, 7500
- DCD 12000, 12000, 12000, 12000
- DCD 51000, 51000, 51000, 51000
-
-;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct4x4_neon| PROC
-
- ; Part one
- vld1.16 {d0}, [r0@64], r2
- adr r12, coeff
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {d2}, [r0@64], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {d3}, [r0@64], r2
-
- ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
- vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
- vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q2, q2, #3 ; (a1, b1) << 3
- vshl.s16 q3, q3, #3 ; (c1, d1) << 3
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
- vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
-
- vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
- vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
- vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
-
- ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vmov.s16 d26, #7
-
- vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
- vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
- vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
- vadd.s16 d4, d4, d26 ; a1 + 7
- vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
- vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
-
- vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
- vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
-
- vceq.s16 d4, d7, #0
-
- vshr.s16 d0, d0, #4
- vshr.s16 d2, d2, #4
-
- vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
-
- vmvn d4, d4
- vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
- vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
- vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
-;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-|vp8_short_fdct8x4_neon| PROC
-
- ; Part one
-
- vld1.16 {q0}, [r0@128], r2
- adr r12, coeff
- vld1.16 {q1}, [r0@128], r2
- vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
- vld1.16 {q2}, [r0@128], r2
- vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
- vld1.16 {q3}, [r0@128], r2
-
- ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
- vtrn.32 q0, q2 ; [A0|B0]
- vtrn.32 q1, q3 ; [A1|B1]
- vtrn.16 q0, q1 ; [A2|B2]
- vtrn.16 q2, q3 ; [A3|B3]
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
- vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
- vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
-
- vshl.s16 q11, q11, #3 ; a1 << 3
- vshl.s16 q12, q12, #3 ; b1 << 3
- vshl.s16 q13, q13, #3 ; c1 << 3
- vshl.s16 q14, q14, #3 ; d1 << 3
-
- vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
- vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
-
- vmov.s16 q11, q9 ; 14500
- vmov.s16 q12, q10 ; 7500
-
- vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
- vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
- vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
- vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
-
- vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
- vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
- vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
-
- vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
- vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
- vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
-
-
- ; Part two
- vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
-
- ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
- vtrn.32 q0, q2 ; q0=[A0 | B0]
- vtrn.32 q1, q3 ; q1=[A4 | B4]
- vtrn.16 q0, q1 ; q2=[A8 | B8]
- vtrn.16 q2, q3 ; q3=[A12|B12]
-
- vmov.s16 q15, #7
-
- vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
- vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
- vadd.s16 q11, q11, q15 ; a1 + 7
- vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
- vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
-
- vadd.s16 q0, q11, q12 ; a1 + b1 + 7
- vsub.s16 q1, q11, q12 ; a1 - b1 + 7
-
- vmov.s16 q11, q9 ; 12000
- vmov.s16 q12, q10 ; 51000
-
- vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
- vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
- vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
- vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
-
-
- vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
- vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
- vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
- vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
-
- vceq.s16 q14, q14, #0
-
- vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
- vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
- vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
-
- vmvn q14, q14
-
- vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
-
- vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
- vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
- vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
-
- vst1.16 {q0, q1}, [r1@128]! ; block A
- vst1.16 {q2, q3}, [r1@128]! ; block B
-
- bx lr
-
- ENDP
-
- END
-
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.c b/vp8/encoder/arm/neon/shortfdct_neon.c
new file mode 100644
index 0000000..391e5f9
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+ uint16x4_t d4u16;
+ int16x8_t q0s16, q1s16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+ q11s32 = vdupq_n_s32(12000);
+ q12s32 = vdupq_n_s32(51000);
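+  // 5352 and 2217 are the VP8 fixed-point DCT multipliers; 14500/7500 and
+  // 12000/51000 fold the rounding offsets into the first (>>12) and second
+  // (>>16) passes, matching the assembly this replaces.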
+
+ // Part one
+ pitch >>= 1;
+ d0s16 = vld1_s16(input);
+ input += pitch;
+ d1s16 = vld1_s16(input);
+ input += pitch;
+ d2s16 = vld1_s16(input);
+ input += pitch;
+ d3s16 = vld1_s16(input);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d4s16 = vshl_n_s16(d4s16, 3);
+ d5s16 = vshl_n_s16(d5s16, 3);
+ d6s16 = vshl_n_s16(d6s16, 3);
+ d7s16 = vshl_n_s16(d7s16, 3);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+ q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+ d1s16 = vshrn_n_s32(q9s32, 12);
+ d3s16 = vshrn_n_s32(q10s32, 12);
+
+ // Part two
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ d26s16 = vdup_n_s16(7);
+ d4s16 = vadd_s16(d4s16, d26s16);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d2s16 = vsub_s16(d4s16, d5s16);
+
+ q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+ dEmptys16 = vdup_n_s16(0);
+ d4u16 = vceq_s16(d7s16, dEmptys16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+
+ q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+ d4u16 = vmvn_u16(d4u16);
+ d1s16 = vshrn_n_s32(q11s32, 16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+ d3s16 = vshrn_n_s32(q12s32, 16);
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ return;
+}
+
+void vp8_short_fdct8x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
+ uint16x4_t d28u16, d29u16;
+ uint16x8_t q14u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16;
+ int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+ int16x8x2_t v2tmp0, v2tmp1;
+ int32x4x2_t v2tmp2, v2tmp3;
+
+ d16s16 = vdup_n_s16(5352);
+ d17s16 = vdup_n_s16(2217);
+ q9s32 = vdupq_n_s32(14500);
+ q10s32 = vdupq_n_s32(7500);
+
+ // Part one
+ pitch >>= 1;
+ q0s16 = vld1q_s16(input);
+ input += pitch;
+ q1s16 = vld1q_s16(input);
+ input += pitch;
+ q2s16 = vld1q_s16(input);
+ input += pitch;
+ q3s16 = vld1q_s16(input);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q11s16 = vshlq_n_s16(q11s16, 3);
+ q12s16 = vshlq_n_s16(q12s16, 3);
+ q13s16 = vshlq_n_s16(q13s16, 3);
+ q14s16 = vshlq_n_s16(q14s16, 3);
+
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q2s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d2s16 = vshrn_n_s32(q9s32, 12);
+ d6s16 = vshrn_n_s32(q10s32, 12);
+ d3s16 = vshrn_n_s32(q11s32, 12);
+ d7s16 = vshrn_n_s32(q12s32, 12);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ // Part two
+ q9s32 = vdupq_n_s32(12000);
+ q10s32 = vdupq_n_s32(51000);
+
+ v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
+ vreinterpretq_s32_s16(q2s16));
+ v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
+ vreinterpretq_s32_s16(q3s16));
+ v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
+ vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
+ v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
+ vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
+
+ q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+ q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
+ q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+ q15s16 = vdupq_n_s16(7);
+ q11s16 = vaddq_s16(q11s16, q15s16);
+ q0s16 = vaddq_s16(q11s16, q12s16);
+ q1s16 = vsubq_s16(q11s16, q12s16);
+
+ q11s32 = q9s32;
+ q12s32 = q10s32;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d2s16 = vget_low_s16(q1s16);
+ d3s16 = vget_high_s16(q1s16);
+
+ d0s16 = vshr_n_s16(d0s16, 4);
+ d4s16 = vshr_n_s16(d1s16, 4);
+ d2s16 = vshr_n_s16(d2s16, 4);
+ d6s16 = vshr_n_s16(d3s16, 4);
+
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
+ q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
+ q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
+
+ q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
+ q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
+
+ d1s16 = vshrn_n_s32(q9s32, 16);
+ d3s16 = vshrn_n_s32(q10s32, 16);
+ d5s16 = vshrn_n_s32(q11s32, 16);
+ d7s16 = vshrn_n_s32(q12s32, 16);
+
+ qEmptys16 = vdupq_n_s16(0);
+ q14u16 = vceqq_s16(q14s16, qEmptys16);
+ q14u16 = vmvnq_u16(q14u16);
+
+ d28u16 = vget_low_u16(q14u16);
+ d29u16 = vget_high_u16(q14u16);
+ d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
+ d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ vst1q_s16(output + 16, q2s16);
+ vst1q_s16(output + 24, q3s16);
+ return;
+}
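
A note on the transpose idiom in shortfdct_neon.c above: both fdct functions transpose 4x4 blocks of int16 with a vtrn_s32/vtrn_s16 pair rather than per-element shuffles. A minimal, self-contained sketch of that idiom (hypothetical helper, not part of this change):

#include <arm_neon.h>

// Transpose a 4x4 block of int16 held in four D registers.
// Step 1: vtrn_s32 swaps 32-bit lanes between rows 0/2 and 1/3.
// Step 2: vtrn_s16 swaps 16-bit lanes within each 32-bit pair.
static void transpose_4x4_s16(int16x4_t r[4]) {
  const int32x2x2_t t02 = vtrn_s32(vreinterpret_s32_s16(r[0]),
                                   vreinterpret_s32_s16(r[2]));
  const int32x2x2_t t13 = vtrn_s32(vreinterpret_s32_s16(r[1]),
                                   vreinterpret_s32_s16(r[3]));
  const int16x4x2_t t01 = vtrn_s16(vreinterpret_s16_s32(t02.val[0]),
                                   vreinterpret_s16_s32(t13.val[0]));
  const int16x4x2_t t23 = vtrn_s16(vreinterpret_s16_s32(t02.val[1]),
                                   vreinterpret_s16_s32(t13.val[1]));
  r[0] = t01.val[0];
  r[1] = t01.val[1];
  r[2] = t23.val[0];
  r[3] = t23.val[1];
}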
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
deleted file mode 100644
index 840cb33..0000000
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp8_subtract_b_neon|
- EXPORT |vp8_subtract_mby_neon|
- EXPORT |vp8_subtract_mbuv_neon|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
-|vp8_subtract_b_neon| PROC
-
- stmfd sp!, {r4-r7}
-
- ldr r3, [r0, #vp8_block_base_src]
- ldr r4, [r0, #vp8_block_src]
- ldr r5, [r0, #vp8_block_src_diff]
- ldr r3, [r3]
- ldr r6, [r0, #vp8_block_src_stride]
- add r3, r3, r4 ; src = *base_src + src
- ldr r7, [r1, #vp8_blockd_predictor]
-
- vld1.8 {d0}, [r3], r6 ;load src
- vld1.8 {d1}, [r7], r2 ;load pred
- vld1.8 {d2}, [r3], r6
- vld1.8 {d3}, [r7], r2
- vld1.8 {d4}, [r3], r6
- vld1.8 {d5}, [r7], r2
- vld1.8 {d6}, [r3], r6
- vld1.8 {d7}, [r7], r2
-
- vsubl.u8 q10, d0, d1
- vsubl.u8 q11, d2, d3
- vsubl.u8 q12, d4, d5
- vsubl.u8 q13, d6, d7
-
- mov r2, r2, lsl #1
-
- vst1.16 {d20}, [r5], r2 ;store diff
- vst1.16 {d22}, [r5], r2
- vst1.16 {d24}, [r5], r2
- vst1.16 {d26}, [r5], r2
-
- ldmfd sp!, {r4-r7}
- bx lr
-
- ENDP
-
-
-;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
-; unsigned char *pred, int pred_stride)
-|vp8_subtract_mby_neon| PROC
- push {r4-r7}
- vpush {d8-d15}
-
- mov r12, #4
- ldr r4, [sp, #80] ; pred_stride
- mov r6, #32 ; "diff" stride x2
- add r5, r0, #16 ; second diff pointer
-
-subtract_mby_loop
- vld1.8 {q0}, [r1], r2 ;load src
- vld1.8 {q1}, [r3], r4 ;load pred
- vld1.8 {q2}, [r1], r2
- vld1.8 {q3}, [r3], r4
- vld1.8 {q4}, [r1], r2
- vld1.8 {q5}, [r3], r4
- vld1.8 {q6}, [r1], r2
- vld1.8 {q7}, [r3], r4
-
- vsubl.u8 q8, d0, d2
- vsubl.u8 q9, d1, d3
- vsubl.u8 q10, d4, d6
- vsubl.u8 q11, d5, d7
- vsubl.u8 q12, d8, d10
- vsubl.u8 q13, d9, d11
- vsubl.u8 q14, d12, d14
- vsubl.u8 q15, d13, d15
-
- vst1.16 {q8}, [r0], r6 ;store diff
- vst1.16 {q9}, [r5], r6
- vst1.16 {q10}, [r0], r6
- vst1.16 {q11}, [r5], r6
- vst1.16 {q12}, [r0], r6
- vst1.16 {q13}, [r5], r6
- vst1.16 {q14}, [r0], r6
- vst1.16 {q15}, [r5], r6
-
- subs r12, r12, #1
- bne subtract_mby_loop
-
- vpop {d8-d15}
- pop {r4-r7}
- bx lr
- ENDP
-
-;=================================
-;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
-; int src_stride, unsigned char *upred,
-; unsigned char *vpred, int pred_stride)
-
-|vp8_subtract_mbuv_neon| PROC
- push {r4-r7}
- vpush {d8-d15}
-
- ldr r4, [sp, #80] ; upred
- ldr r5, [sp, #84] ; vpred
- ldr r6, [sp, #88] ; pred_stride
- add r0, r0, #512 ; short *udiff = diff + 256;
- mov r12, #32 ; "diff" stride x2
- add r7, r0, #16 ; second diff pointer
-
-;u
- vld1.8 {d0}, [r1], r3 ;load usrc
- vld1.8 {d1}, [r4], r6 ;load upred
- vld1.8 {d2}, [r1], r3
- vld1.8 {d3}, [r4], r6
- vld1.8 {d4}, [r1], r3
- vld1.8 {d5}, [r4], r6
- vld1.8 {d6}, [r1], r3
- vld1.8 {d7}, [r4], r6
- vld1.8 {d8}, [r1], r3
- vld1.8 {d9}, [r4], r6
- vld1.8 {d10}, [r1], r3
- vld1.8 {d11}, [r4], r6
- vld1.8 {d12}, [r1], r3
- vld1.8 {d13}, [r4], r6
- vld1.8 {d14}, [r1], r3
- vld1.8 {d15}, [r4], r6
-
- vsubl.u8 q8, d0, d1
- vsubl.u8 q9, d2, d3
- vsubl.u8 q10, d4, d5
- vsubl.u8 q11, d6, d7
- vsubl.u8 q12, d8, d9
- vsubl.u8 q13, d10, d11
- vsubl.u8 q14, d12, d13
- vsubl.u8 q15, d14, d15
-
- vst1.16 {q8}, [r0], r12 ;store diff
- vst1.16 {q9}, [r7], r12
- vst1.16 {q10}, [r0], r12
- vst1.16 {q11}, [r7], r12
- vst1.16 {q12}, [r0], r12
- vst1.16 {q13}, [r7], r12
- vst1.16 {q14}, [r0], r12
- vst1.16 {q15}, [r7], r12
-
-;v
- vld1.8 {d0}, [r2], r3 ;load vsrc
- vld1.8 {d1}, [r5], r6 ;load vpred
- vld1.8 {d2}, [r2], r3
- vld1.8 {d3}, [r5], r6
- vld1.8 {d4}, [r2], r3
- vld1.8 {d5}, [r5], r6
- vld1.8 {d6}, [r2], r3
- vld1.8 {d7}, [r5], r6
- vld1.8 {d8}, [r2], r3
- vld1.8 {d9}, [r5], r6
- vld1.8 {d10}, [r2], r3
- vld1.8 {d11}, [r5], r6
- vld1.8 {d12}, [r2], r3
- vld1.8 {d13}, [r5], r6
- vld1.8 {d14}, [r2], r3
- vld1.8 {d15}, [r5], r6
-
- vsubl.u8 q8, d0, d1
- vsubl.u8 q9, d2, d3
- vsubl.u8 q10, d4, d5
- vsubl.u8 q11, d6, d7
- vsubl.u8 q12, d8, d9
- vsubl.u8 q13, d10, d11
- vsubl.u8 q14, d12, d13
- vsubl.u8 q15, d14, d15
-
- vst1.16 {q8}, [r0], r12 ;store diff
- vst1.16 {q9}, [r7], r12
- vst1.16 {q10}, [r0], r12
- vst1.16 {q11}, [r7], r12
- vst1.16 {q12}, [r0], r12
- vst1.16 {q13}, [r7], r12
- vst1.16 {q14}, [r0], r12
- vst1.16 {q15}, [r7], r12
-
- vpop {d8-d15}
- pop {r4-r7}
- bx lr
-
- ENDP
-
- END
diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c
new file mode 100644
index 0000000..d3ab7b1
--- /dev/null
+++ b/vp8/encoder/arm/neon/subtract_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+void vp8_subtract_b_neon(
+ BLOCK *be,
+ BLOCKD *bd,
+ int pitch) {
+ unsigned char *src_ptr, *predictor;
+ int src_stride;
+ int16_t *src_diff;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint16x8_t q10u16, q11u16, q12u16, q13u16;
+
+ src_ptr = *be->base_src + be->src;
+ src_stride = be->src_stride;
+ predictor = bd->predictor;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d4u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d6u8 = vld1_u8(src_ptr);
+
+ d1u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d3u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d5u8 = vld1_u8(predictor);
+ predictor += pitch;
+ d7u8 = vld1_u8(predictor);
+
+ q10u16 = vsubl_u8(d0u8, d1u8);
+ q11u16 = vsubl_u8(d2u8, d3u8);
+ q12u16 = vsubl_u8(d4u8, d5u8);
+ q13u16 = vsubl_u8(d6u8, d7u8);
+
+ src_diff = be->src_diff;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
+ src_diff += pitch;
+ vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
+ return;
+}
+
+void vp8_subtract_mby_neon(
+ int16_t *diff,
+ unsigned char *src,
+ int src_stride,
+ unsigned char *pred,
+ int pred_stride) {
+ int i;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ for (i = 0; i < 8; i++) { // subtract_mby_loop
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(pred);
+ pred += pred_stride;
+ q3u8 = vld1q_u8(pred);
+ pred += pred_stride;
+
+ q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
+ q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
+
+ vst1q_u16((uint16_t *)diff, q8u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q9u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q10u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q11u16);
+ diff += 8;
+ }
+ return;
+}
+
+void vp8_subtract_mbuv_neon(
+ int16_t *diff,
+ unsigned char *usrc,
+ unsigned char *vsrc,
+ int src_stride,
+ unsigned char *upred,
+ unsigned char *vpred,
+ int pred_stride) {
+ int i, j;
+ unsigned char *src_ptr, *pred_ptr;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ diff += 256;
+ for (i = 0; i < 2; i++) {
+ if (i == 0) {
+ src_ptr = usrc;
+ pred_ptr = upred;
+ } else if (i == 1) {
+ src_ptr = vsrc;
+ pred_ptr = vpred;
+ }
+
+ for (j = 0; j < 2; j++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d1u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d3u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d4u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d5u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+ d6u8 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d7u8 = vld1_u8(pred_ptr);
+ pred_ptr += pred_stride;
+
+ q8u16 = vsubl_u8(d0u8, d1u8);
+ q9u16 = vsubl_u8(d2u8, d3u8);
+ q10u16 = vsubl_u8(d4u8, d5u8);
+ q11u16 = vsubl_u8(d6u8, d7u8);
+
+ vst1q_u16((uint16_t *)diff, q8u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q9u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q10u16);
+ diff += 8;
+ vst1q_u16((uint16_t *)diff, q11u16);
+ diff += 8;
+ }
+ }
+ return;
+}
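
The subtract functions above rely on vsubl_u8, whose widened uint16 result is bit-identical to the signed difference modulo 2^16; that is why storing it through a (uint16_t *) cast into the int16_t diff buffer is safe. A scalar model of one row, assuming only that equivalence (hypothetical helper, not part of this change):

#include <stdint.h>

// Scalar equivalent of one vsubl_u8 row: src - pred lies in [-255, 255],
// and its two's-complement bit pattern matches the wrapped uint16 result.
static void subtract_row_c(int16_t *diff, const uint8_t *src,
                           const uint8_t *pred, int n) {
  int i;
  for (i = 0; i < n; ++i)
    diff[i] = (int16_t)(src[i] - pred[i]);
}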
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
deleted file mode 100644
index d219e2d..0000000
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ /dev/null
@@ -1,72 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_memcpy_partial_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;=========================================
-;this is not a full memcpy function!!!
-;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
-; int sz);
-|vp8_memcpy_partial_neon| PROC
- vpush {d8-d15}
- ;pld [r1] ;preload pred data
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- mov r12, r2, lsr #8 ;copy 256 bytes data at one time
-
-memcpy_neon_loop
- vld1.8 {q0, q1}, [r1]! ;load src data
- subs r12, r12, #1
- vld1.8 {q2, q3}, [r1]!
- vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
- vld1.8 {q4, q5}, [r1]!
- vst1.8 {q2, q3}, [r0]!
- vld1.8 {q6, q7}, [r1]!
- vst1.8 {q4, q5}, [r0]!
- vld1.8 {q8, q9}, [r1]!
- vst1.8 {q6, q7}, [r0]!
- vld1.8 {q10, q11}, [r1]!
- vst1.8 {q8, q9}, [r0]!
- vld1.8 {q12, q13}, [r1]!
- vst1.8 {q10, q11}, [r0]!
- vld1.8 {q14, q15}, [r1]!
- vst1.8 {q12, q13}, [r0]!
- vst1.8 {q14, q15}, [r0]!
-
- ;pld [r1] ;preload pred data -- need to adjust for real device
- ;pld [r1, #128]
- ;pld [r1, #256]
- ;pld [r1, #384]
-
- bne memcpy_neon_loop
-
- ands r3, r2, #0xff ;extra copy
- beq done_copy_neon_loop
-
-extra_copy_neon_loop
- vld1.8 {q0}, [r1]! ;load src data
- subs r3, r3, #16
- vst1.8 {q0}, [r0]!
- bne extra_copy_neon_loop
-
-done_copy_neon_loop
- vpop {d8-d15}
- bx lr
- ENDP
-
- END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
deleted file mode 100644
index 22266297..0000000
--- a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
+++ /dev/null
@@ -1,103 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_walsh4x4_neon|
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
-; r0 short *input,
-; r1 short *output,
-; r2 int pitch
-|vp8_short_walsh4x4_neon| PROC
-
- vld1.16 {d0}, [r0@64], r2 ; load input
- vld1.16 {d1}, [r0@64], r2
- vld1.16 {d2}, [r0@64], r2
- vld1.16 {d3}, [r0@64]
-
- ;First for-loop
- ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
- vtrn.32 d0, d2
- vtrn.32 d1, d3
-
- vmov.s32 q15, #3 ; add 3 to all values
-
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
- vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
- vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
- vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
-
- vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
- vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
- vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
- vceq.s16 d16, d4, #0 ; a1 == 0
- vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
-
- vadd.s16 d0, d4, d5 ; a1 + d1
- vmvn d16, d16 ; a1 != 0
- vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
- vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
- vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
- vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
-
- ;Second for-loop
- ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
- vtrn.32 d1, d3
- vtrn.32 d0, d2
- vtrn.16 d2, d3
- vtrn.16 d0, d1
-
- vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
- vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
- vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
- vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
-
- vadd.s32 q0, q8, q9 ; a2 = a1 + d1
- vadd.s32 q1, q11, q10 ; b2 = b1 + c1
- vsub.s32 q2, q11, q10 ; c2 = b1 - c1
- vsub.s32 q3, q8, q9 ; d2 = a1 - d1
-
- vclt.s32 q8, q0, #0
- vclt.s32 q9, q1, #0
- vclt.s32 q10, q2, #0
- vclt.s32 q11, q3, #0
-
- ; subtract -1 (or 0)
- vsub.s32 q0, q0, q8 ; a2 += a2 < 0
- vsub.s32 q1, q1, q9 ; b2 += b2 < 0
- vsub.s32 q2, q2, q10 ; c2 += c2 < 0
- vsub.s32 q3, q3, q11 ; d2 += d2 < 0
-
- vadd.s32 q8, q0, q15 ; a2 + 3
- vadd.s32 q9, q1, q15 ; b2 + 3
- vadd.s32 q10, q2, q15 ; c2 + 3
- vadd.s32 q11, q3, q15 ; d2 + 3
-
- ; vrshrn? would add 1 << 3-1 = 2
- vshrn.s32 d0, q8, #3
- vshrn.s32 d1, q9, #3
- vshrn.s32 d2, q10, #3
- vshrn.s32 d3, q11, #3
-
- vst1.16 {q0, q1}, [r1@128]
-
- bx lr
-
- ENDP
-
- END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
new file mode 100644
index 0000000..d6b67f8
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_walsh4x4_neon(
+ int16_t *input,
+ int16_t *output,
+ int pitch) {
+ uint16x4_t d16u16;
+ int16x8_t q0s16, q1s16;
+ int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q15s32;
+ uint32x4_t q8u32, q9u32, q10u32, q11u32;
+ int16x4x2_t v2tmp0, v2tmp1;
+ int32x2x2_t v2tmp2, v2tmp3;
+
+ dEmptys16 = vdup_n_s16(0);
+ qEmptys32 = vdupq_n_s32(0);
+ q15s32 = vdupq_n_s32(3);
+
+ d0s16 = vld1_s16(input);
+ input += pitch/2;
+ d1s16 = vld1_s16(input);
+ input += pitch/2;
+ d2s16 = vld1_s16(input);
+ input += pitch/2;
+ d3s16 = vld1_s16(input);
+
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp3.val[0])); // d1
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp3.val[1])); // d3
+
+ d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+ d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+ d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+ d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+ d4s16 = vshl_n_s16(d4s16, 2);
+ d5s16 = vshl_n_s16(d5s16, 2);
+ d6s16 = vshl_n_s16(d6s16, 2);
+ d7s16 = vshl_n_s16(d7s16, 2);
+
+ d16u16 = vceq_s16(d4s16, dEmptys16);
+ d16u16 = vmvn_u16(d16u16);
+
+ d0s16 = vadd_s16(d4s16, d5s16);
+ d3s16 = vsub_s16(d4s16, d5s16);
+ d1s16 = vadd_s16(d7s16, d6s16);
+ d2s16 = vsub_s16(d7s16, d6s16);
+
+ d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+ // Second for-loop
+ v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+ vreinterpret_s32_s16(d3s16));
+ v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+ vreinterpret_s32_s16(d2s16));
+ v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2
+ vreinterpret_s16_s32(v2tmp2.val[1])); // d3
+ v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0
+ vreinterpret_s16_s32(v2tmp2.val[0])); // d1
+
+ q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+ q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+ q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+ q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+ q0s32 = vaddq_s32(q8s32, q9s32);
+ q1s32 = vaddq_s32(q11s32, q10s32);
+ q2s32 = vsubq_s32(q11s32, q10s32);
+ q3s32 = vsubq_s32(q8s32, q9s32);
+
+ q8u32 = vcltq_s32(q0s32, qEmptys32);
+ q9u32 = vcltq_s32(q1s32, qEmptys32);
+ q10u32 = vcltq_s32(q2s32, qEmptys32);
+ q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+ q8s32 = vreinterpretq_s32_u32(q8u32);
+ q9s32 = vreinterpretq_s32_u32(q9u32);
+ q10s32 = vreinterpretq_s32_u32(q10u32);
+ q11s32 = vreinterpretq_s32_u32(q11u32);
+
+ q0s32 = vsubq_s32(q0s32, q8s32);
+ q1s32 = vsubq_s32(q1s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q3s32 = vsubq_s32(q3s32, q11s32);
+
+ q8s32 = vaddq_s32(q0s32, q15s32);
+ q9s32 = vaddq_s32(q1s32, q15s32);
+ q10s32 = vaddq_s32(q2s32, q15s32);
+ q11s32 = vaddq_s32(q3s32, q15s32);
+
+ d0s16 = vshrn_n_s32(q8s32, 3);
+ d1s16 = vshrn_n_s32(q9s32, 3);
+ d2s16 = vshrn_n_s32(q10s32, 3);
+ d3s16 = vshrn_n_s32(q11s32, 3);
+
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+
+ vst1q_s16(output, q0s16);
+ vst1q_s16(output + 8, q1s16);
+ return;
+}
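
Both the removed assembly ("a2 += a2 < 0") and the new intrinsics in vp8_shortwalsh4x4_neon.c use a vclt/vsub pair to add the sign bit without a branch: vcltq_s32 yields an all-ones mask (-1 as signed) where the value is negative, and subtracting that mask adds 1. A scalar sketch of the trick (hypothetical helper, not part of this change):

// Branch-free "x += (x < 0)": mask is 0 or -1, mirroring the vclt result.
static int add_sign_bit(int x) {
  const int mask = -(x < 0);
  return x - mask;
}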
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 75401fc..2f33d4a 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -68,6 +68,10 @@
int adj_val[3] = {3, 4, 6};
int shift_inc1 = 0;
int shift_inc2 = 1;
+ int col_sum[16] = {0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0};
/* If motion_magnitude is small, make the denoiser more aggressive by
* increasing the adjustment for each level. Add another increment for
* blocks that are labeled for increased denoising. */
@@ -98,11 +102,11 @@
if (absdiff <= 3 + shift_inc1)
{
running_avg_y[c] = mc_running_avg_y[c];
- sum_diff += diff;
+ col_sum[c] += diff;
}
else
{
- if (absdiff >= 4 && absdiff <= 7)
+ if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
adjustment = adj_val[0];
else if (absdiff >= 8 && absdiff <= 15)
adjustment = adj_val[1];
@@ -116,7 +120,7 @@
else
running_avg_y[c] = sig[c] + adjustment;
- sum_diff += adjustment;
+ col_sum[c] += adjustment;
}
else
{
@@ -125,7 +129,7 @@
else
running_avg_y[c] = sig[c] - adjustment;
- sum_diff -= adjustment;
+ col_sum[c] -= adjustment;
}
}
}
@@ -136,6 +140,23 @@
running_avg_y += avg_y_stride;
}
+ for (c = 0; c < 16; ++c) {
+ // Below we clip the value in the same way the SSE code does.
+ // With the aggressive denoiser, the adj_val for each pixel can
+ // be at most 8 (the current maximum adjustment of the map).
+ // The SSE code computes the sum of adj_val per column, so the
+ // sum can be up to 128 (16 rows). However, the value range in
+ // the SSE code is -128 ~ 127, which is why we apply the same
+ // clipping in the C code.
+ // We don't do this for the UV denoiser, since there are only
+ // 8 rows, and the max adjustment is <= 8, so the column sums
+ // cannot exceed 64.
+ if (col_sum[c] >= 128) {
+ col_sum[c] = 127;
+ }
+ sum_diff += col_sum[c];
+ }
+
sum_diff_thresh= SUM_DIFF_THRESHOLD;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (abs(sum_diff) > sum_diff_thresh) {
@@ -166,14 +187,14 @@
running_avg_y[c] = 0;
else
running_avg_y[c] = running_avg_y[c] - adjustment;
- sum_diff -= adjustment;
+ col_sum[c] -= adjustment;
} else if (diff < 0) {
// Bring denoised signal up.
if (running_avg_y[c] + adjustment > 255)
running_avg_y[c] = 255;
else
running_avg_y[c] = running_avg_y[c] + adjustment;
- sum_diff += adjustment;
+ col_sum[c] += adjustment;
}
}
// TODO(marpan): Check here if abs(sum_diff) has gone below the
@@ -182,6 +203,15 @@
mc_running_avg_y += mc_avg_y_stride;
running_avg_y += avg_y_stride;
}
+
+ sum_diff = 0;
+ for (c = 0; c < 16; ++c) {
+ if (col_sum[c] >= 128) {
+ col_sum[c] = 127;
+ }
+ sum_diff += col_sum[c];
+ }
+
if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
} else {
@@ -341,8 +371,10 @@
denoiser->denoiser_mode = kDenoiserOnYOnly;
} else if (mode == 2) {
denoiser->denoiser_mode = kDenoiserOnYUV;
- } else {
+ } else if (mode == 3) {
denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
+ } else {
+ denoiser->denoiser_mode = kDenoiserOnAdaptive;
}
if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
denoiser->denoise_pars.scale_sse_thresh = 1;
@@ -397,9 +429,40 @@
vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
denoiser->yv12_mc_running_avg.frame_size);
+ if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
+ height, VP8BORDERINPIXELS) < 0) {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+ vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
+ denoiser->yv12_last_source.frame_size);
+
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
vp8_denoiser_set_parameters(denoiser, mode);
+ denoiser->nmse_source_diff = 0;
+ denoiser->nmse_source_diff_count = 0;
+ denoiser->qp_avg = 0;
+ // QP threshold below which we can go up to aggressive mode.
+ denoiser->qp_threshold_up = 80;
+ // QP threshold above which we can go back down to normal mode.
+ // For now keep this second threshold high, so not used currently.
+ denoiser->qp_threshold_down = 128;
+ // Bitrate thresholds and noise metric (nmse) thresholds for switching to
+ // aggressive mode.
+ // TODO(marpan): Adjust thresholds, including effect on resolution.
+ denoiser->bitrate_threshold = 200000; // (bits/sec).
+ denoiser->threshold_aggressive_mode = 35;
+ // Test the largest resolution first so each branch is reachable.
+ if (width * height > 1280 * 720) {
+ denoiser->bitrate_threshold = 2000000;
+ denoiser->threshold_aggressive_mode = 1400;
+ } else if (width * height > 960 * 540) {
+ denoiser->bitrate_threshold = 800000;
+ denoiser->threshold_aggressive_mode = 150;
+ } else if (width * height > 640 * 480) {
+ denoiser->bitrate_threshold = 500000;
+ denoiser->threshold_aggressive_mode = 100;
+ }
return 0;
}
@@ -414,6 +477,7 @@
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
}
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
vpx_free(denoiser->denoise_state);
}
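
To make the saturation added to denoising.c above concrete: with the aggressive map, the per-pixel adjustment can reach 8, so a 16-row column can accumulate 8 * 16 = 128, one past the top of the SSE path's signed 8-bit range [-128, 127]; clamping each column sum at 127 keeps the C and SSE results identical. A sketch of the fold, assuming those constants (hypothetical helper, not part of this change):

// Fold 16 per-column adjustment sums into sum_diff, saturating each
// column at 127 to match the SSE accumulator's signed 8-bit ceiling.
static int fold_col_sums(int col_sum[16]) {
  int c, sum_diff = 0;
  for (c = 0; c < 16; ++c) {
    if (col_sum[c] >= 128)
      col_sum[c] = 127;
    sum_diff += col_sum[c];
  }
  return sum_diff;
}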
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 89832d3..bb0c2ce 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -19,12 +19,12 @@
#endif
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
+#define SUM_DIFF_THRESHOLD_HIGH (600)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
-#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
enum vp8_denoiser_decision
@@ -43,7 +43,8 @@
kDenoiserOff,
kDenoiserOnYOnly,
kDenoiserOnYUV,
- kDenoiserOnYUVAggressive
+ kDenoiserOnYUVAggressive,
+ kDenoiserOnAdaptive
};
typedef struct {
@@ -72,9 +73,18 @@
{
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG yv12_mc_running_avg;
+ // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peek.
+ YV12_BUFFER_CONFIG yv12_last_source;
unsigned char* denoise_state;
int num_mb_cols;
int denoiser_mode;
+ int threshold_aggressive_mode;
+ int nmse_source_diff;
+ int nmse_source_diff_count;
+ int qp_avg;
+ int qp_threshold_up;
+ int qp_threshold_down;
+ int bitrate_threshold;
denoise_params denoise_pars;
} VP8_DENOISER;
@@ -83,6 +93,8 @@
void vp8_denoiser_free(VP8_DENOISER *denoiser);
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
+
void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MACROBLOCK *x,
unsigned int best_sse,
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7140f2f..791a485 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -615,19 +615,21 @@
cpi->cyclic_refresh_mode_index = i;
#if CONFIG_TEMPORAL_DENOISING
- if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
- Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
- // Under aggressive denoising mode, use segmentation to turn off loop
- // filter below some qp thresh. The loop filter is turned off for all
- // blocks that have been encoded as ZEROMV LAST x frames in a row,
- // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
- // This is to avoid "dot" artifacts that can occur from repeated
- // loop filtering on noisy input source.
- cpi->cyclic_refresh_q = Q;
- lf_adjustment = -MAX_LOOP_FILTER;
- for (i = 0; i < mbs_in_frame; ++i) {
- seg_map[i] = (cpi->consec_zero_last[i] >
- cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+ Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+ // Under aggressive denoising, use segmentation to turn off loop
+ // filter below some qp thresh. The filter is turned off for all
+ // blocks that have been encoded as ZEROMV LAST x frames in a row,
+ // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+ // This is to avoid "dot" artifacts that can occur from repeated
+ // loop filtering on noisy input source.
+ cpi->cyclic_refresh_q = Q;
+ lf_adjustment = -MAX_LOOP_FILTER;
+ for (i = 0; i < mbs_in_frame; ++i) {
+ seg_map[i] = (cpi->consec_zero_last[i] >
+ cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+ }
}
}
#endif
@@ -3150,10 +3152,8 @@
cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
}
else /* For non key frames */
{
@@ -3165,9 +3165,7 @@
cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
cm->alt_fb_idx = cm->new_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
-#endif
}
else if (cm->copy_buffer_to_arf)
{
@@ -3181,10 +3179,8 @@
yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
cm->alt_fb_idx = cm->lst_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[ALTREF_FRAME] =
cpi->current_ref_frames[LAST_FRAME];
-#endif
}
}
else /* if (cm->copy_buffer_to_arf == 2) */
@@ -3195,10 +3191,8 @@
yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
cm->alt_fb_idx = cm->gld_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[ALTREF_FRAME] =
cpi->current_ref_frames[GOLDEN_FRAME];
-#endif
}
}
}
@@ -3211,9 +3205,7 @@
cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
cm->gld_fb_idx = cm->new_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
-#endif
}
else if (cm->copy_buffer_to_gf)
{
@@ -3227,10 +3219,8 @@
yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
cm->gld_fb_idx = cm->lst_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[GOLDEN_FRAME] =
cpi->current_ref_frames[LAST_FRAME];
-#endif
}
}
else /* if (cm->copy_buffer_to_gf == 2) */
@@ -3241,10 +3231,8 @@
yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
cm->gld_fb_idx = cm->alt_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[GOLDEN_FRAME] =
cpi->current_ref_frames[ALTREF_FRAME];
-#endif
}
}
}
@@ -3256,9 +3244,7 @@
cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
cm->lst_fb_idx = cm->new_fb_idx;
-#if CONFIG_MULTI_RES_ENCODING
cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
-#endif
}
#if CONFIG_TEMPORAL_DENOISING
@@ -3299,12 +3285,133 @@
&cpi->denoiser.yv12_running_avg[LAST_FRAME]);
}
}
+ if (cpi->oxcf.noise_sensitivity == 4)
+ vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source);
}
#endif
}
+#if CONFIG_TEMPORAL_DENOISING
+static void process_denoiser_mode_change(VP8_COMP *cpi) {
+ const VP8_COMMON *const cm = &cpi->common;
+ int i, j;
+ int total = 0;
+ int num_blocks = 0;
+ // Number of blocks skipped along row/column in computing the
+ // nmse (normalized mean square error) of the source.
+ int skip = 2;
+ // Only select blocks for computing nmse that have been encoded
+ // as ZEROMV LAST for min_consec_zero_last frames in a row.
+ // Scale with number of temporal layers.
+ int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
+ // The decision to change the denoising mode is tested every
+ // num_mode_change times this function is called. Note that this
+ // function is called every 8 frames, so (8 * num_mode_change) is
+ // the number of frames over which a denoising mode switch is
+ // considered.
+ int num_mode_change = 15;
+ // Framerate factor, to compensate for larger mse at lower framerates.
+ // Use ref_framerate, which is the full source framerate for temporal
+ // layers.
+ // TODO(marpan): Adjust this factor.
+ int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
+ int tot_num_blocks = cm->mb_rows * cm->mb_cols;
+ int ystride = cpi->Source->y_stride;
+ unsigned char *src = cpi->Source->y_buffer;
+ unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer;
+ static const unsigned char const_source[16] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128};
+
+ // Loop through the Y plane, every skip blocks along rows and columns,
+ // summing the normalized mean square error, only for blocks that have
+ // been encoded as ZEROMV LAST for at least min_consec_zero_last frames
+ // in a row and have a small sum difference between the current and
+ // previous frame.
+ // Normalization here is by the contrast of the current frame block.
+ for (i = 0; i < cm->Height; i += 16 * skip) {
+ int block_index_row = (i >> 4) * cm->mb_cols;
+ for (j = 0; j < cm->Width; j += 16 * skip) {
+ int index = block_index_row + (j >> 4);
+ if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+ unsigned int sse;
+ const unsigned int mse = vp8_mse16x16(src + j,
+ ystride,
+ dst + j,
+ ystride,
+ &sse);
+ const unsigned int var = vp8_variance16x16(src + j,
+ ystride,
+ dst + j,
+ ystride,
+ &sse);
+ // Only consider this block as valid for noise measurement
+ // if the sum_diff average of the current and previous frame
+ // is small (to avoid effects from lighting change).
+ if ((mse - var) < 256) {
+ const unsigned int act = vp8_variance16x16(src + j,
+ ystride,
+ const_source,
+ 0,
+ &sse);
+ if (act > 0)
+ total += mse / act;
+ num_blocks++;
+ }
+ }
+ }
+ src += 16 * skip * ystride;
+ dst += 16 * skip * ystride;
+ }
+ total = total * fac_framerate / 100;
+
+ // Only consider this frame as a valid sample if we have computed nmse
+ // over at least ~1/16 of the blocks, and total > 0 (total == 0 can
+ // happen if the application inputs duplicate frames, or the contrast
+ // is all zero).
+ if (total > 0 &&
+ (num_blocks > (tot_num_blocks >> 4))) {
+ // Update the recursive mean square source_diff.
+ if (cpi->denoiser.nmse_source_diff_count == 0) {
+ // First sample in new interval.
+ cpi->denoiser.nmse_source_diff = total;
+ cpi->denoiser.qp_avg = cm->base_qindex;
+ } else {
+ // For subsequent samples, use an average with weight ~1/4 for the new sample.
+ cpi->denoiser.nmse_source_diff = (int)((total >> 2) +
+ 3 * (cpi->denoiser.nmse_source_diff >> 2));
+ cpi->denoiser.qp_avg = (int)((cm->base_qindex >> 2) +
+ 3 * (cpi->denoiser.qp_avg >> 2));
+ }
+ cpi->denoiser.nmse_source_diff_count++;
+ }
+ // Check for changing the denoiser mode, when we have obtained #samples =
+ // num_mode_change. Condition the change also on the bitrate and QP.
+ if (cpi->denoiser.nmse_source_diff_count == num_mode_change) {
+ // Check for going up: from normal to aggressive mode.
+ if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) &&
+ (cpi->denoiser.nmse_source_diff >
+ cpi->denoiser.threshold_aggressive_mode) &&
+ (cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
+ cpi->target_bandwidth > cpi->denoiser.bitrate_threshold)) {
+ vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
+ } else {
+ // Check for going down: from aggressive to normal mode.
+ if (((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+ (cpi->denoiser.nmse_source_diff <
+ cpi->denoiser.threshold_aggressive_mode)) ||
+ ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
+ (cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
+ cpi->target_bandwidth < cpi->denoiser.bitrate_threshold))) {
+ vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+ }
+ }
+ // Reset metric and counter for next interval.
+ cpi->denoiser.nmse_source_diff = 0;
+ cpi->denoiser.qp_avg = 0;
+ cpi->denoiser.nmse_source_diff_count = 0;
+ }
+}
+#endif
+
void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
const FRAME_TYPE frame_type = cm->frame_type;
@@ -3461,6 +3568,12 @@
{
/* Key frame from VFW/auto-keyframe/first frame */
cm->frame_type = KEY_FRAME;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity == 4) {
+ // For adaptive mode, reset denoiser to normal mode on key frame.
+ vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
+ }
+#endif
}
#if CONFIG_MULTI_RES_ENCODING
@@ -3494,6 +3607,31 @@
}
#endif
+ // Find the reference frame closest to the current frame.
+ cpi->closest_reference_frame = LAST_FRAME;
+ if (cm->frame_type != KEY_FRAME) {
+ int i;
+ MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+ closest_ref = LAST_FRAME;
+ } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) {
+ closest_ref = GOLDEN_FRAME;
+ } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
+ closest_ref = ALTREF_FRAME;
+ }
+ for (i = 1; i <= 3; i++) {
+ vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
+ ((i == 3) ? 4 : i);
+ if (cpi->ref_frame_flags & ref_frame_type) {
+ if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
+ (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+ closest_ref = i;
+ }
+ }
+ }
+ cpi->closest_reference_frame = closest_ref;
+ }
+
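+ // Note: the (i == 3) ? 4 : i expression above converts a frame index
+ // into its reference flag bit, since the flags are powers of two
+ // (LAST = 1, GOLDEN = 2, ALTREF = 4); a sketch of the mapping, under
+ // that assumption (hypothetical helper, not part of this change):
+ //
+ //   static int ref_index_to_flag(int ref_index) {
+ //     // LAST_FRAME = 1 -> 0x1, GOLDEN_FRAME = 2 -> 0x2,
+ //     // ALTREF_FRAME = 3 -> 0x4; only ALTREF needs remapping.
+ //     return (ref_index == 3) ? 4 : ref_index;
+ //   }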
/* Set various flags etc to special state if it is a key frame */
if (cm->frame_type == KEY_FRAME)
{
@@ -4420,7 +4558,8 @@
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
{
- if(tmp->mbmi.mode == ZEROMV)
+ if (tmp->mbmi.mode == ZEROMV &&
+ tmp->mbmi.ref_frame == LAST_FRAME)
cpi->zeromv_count++;
tmp++;
}
@@ -4462,6 +4601,21 @@
cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+#if CONFIG_TEMPORAL_DENOISING
+ // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
+ // of source diff (between current and previous frame), and determine if we
+ // should switch the denoiser mode. Sampling refers to computing the mse for
+ // a sub-sample of the frame (i.e., skip x blocks along row/column), and
+ // only for blocks in that set that have used ZEROMV LAST, along with some
+ // constraint on the sum diff between blocks. This process is called every
+ // ~8 frames, to further reduce complexity.
+ if (cpi->oxcf.noise_sensitivity == 4 &&
+ cpi->frames_since_key % 8 == 0 &&
+ cm->frame_type != KEY_FRAME) {
+ process_denoiser_mode_change(cpi);
+ }
+#endif
+
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded)
{
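
The recursive averaging in process_denoiser_mode_change above weights the newest sample by ~1/4 using shifts. A sketch, assuming integer samples (hypothetical helper, not part of this change):

// Running average with weight ~1/4 for the new sample, as used for
// nmse_source_diff and qp_avg: avg' = new/4 + 3*avg/4 (shift-based).
static int update_running_avg(int avg, int sample, int is_first_sample) {
  if (is_first_sample)
    return sample;
  return (sample >> 2) + 3 * (avg >> 2);
}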
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 7a8baca..f0424e6 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -684,9 +684,10 @@
int mr_low_res_mb_cols;
/* Indicate if lower-res mv info is available */
unsigned char mr_low_res_mv_avail;
+#endif
/* The frame number of each reference frames */
unsigned int current_ref_frames[MAX_REF_FRAMES];
-#endif
+ MV_REFERENCE_FRAME closest_reference_frame;
struct rd_costs_struct
{
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index d0ad721..43f8957 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -487,6 +487,7 @@
MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
int this_rd;
+ int denoise_aggressive = 0;
/* Exit early and don't compute the distortion if this macroblock
* is marked inactive. */
if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -505,16 +506,18 @@
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
- /* Adjust rd to bias to ZEROMV */
- if(this_mode == ZEROMV)
- {
- /* Bias to ZEROMV on LAST_FRAME reference when it is available. */
- if ((cpi->ref_frame_flags & VP8_LAST_FRAME &
- cpi->common.refresh_last_frame)
- && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME)
- rd_adj = 100;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ denoise_aggressive =
+ (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0;
+ }
+#endif
- // rd_adj <= 100
+ // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
+ if (this_mode == ZEROMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+ (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME))
+ {
this_rd = ((int64_t)this_rd) * rd_adj / 100;
}
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 250d04c..f0c8f28 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -23,8 +23,8 @@
extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
- YV12_BUFFER_CONFIG *dst_ybc)
+static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
{
unsigned char *src_y, *dst_y;
int yheight;
@@ -173,7 +173,7 @@
/* Get the err using the previous frame's filter value. */
/* Copy the unfiltered / processed recon buffer to the new buffer */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
@@ -184,7 +184,7 @@
while (filt_val >= min_filter_level)
{
/* Apply the loop filter */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
/* Get the err for filtered frame */
@@ -214,7 +214,7 @@
while (filt_val < max_filter_level)
{
/* Apply the loop filter */
- vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 6db031f..9b11c0d 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -155,30 +155,25 @@
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
-# common (neon)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
-#VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_blk_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
-
# common (neon intrinsics)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index d515fc0..b1b079c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -14,6 +14,7 @@
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
+#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/onyx_int.h"
#include "vpx/vp8cx.h"
#include "vp8/encoder/firstpass.h"
@@ -39,40 +40,28 @@
};
-struct extraconfig_map
-{
- int usage;
- struct vp8_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] =
-{
- {
- 0,
- {
- NULL,
+static struct vp8_extracfg default_extracfg = {
+ NULL,
#if !(CONFIG_REALTIME_ONLY)
- 0, /* cpu_used */
+ 0, /* cpu_used */
#else
- 4, /* cpu_used */
+ 4, /* cpu_used */
#endif
- 0, /* enable_auto_alt_ref */
- 0, /* noise_sensitivity */
- 0, /* Sharpness */
- 0, /* static_thresh */
+ 0, /* enable_auto_alt_ref */
+ 0, /* noise_sensitivity */
+ 0, /* Sharpness */
+ 0, /* static_thresh */
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
- VP8_EIGHT_TOKENPARTITION,
+ VP8_EIGHT_TOKENPARTITION,
#else
- VP8_ONE_TOKENPARTITION, /* token_partitions */
+ VP8_ONE_TOKENPARTITION, /* token_partitions */
#endif
- 0, /* arnr_max_frames */
- 3, /* arnr_strength */
- 3, /* arnr_type*/
- 0, /* tuning*/
- 10, /* cq_level */
- 0, /* rc_max_intra_bitrate_pct */
- }
- }
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
+ 0, /* tuning*/
+ 10, /* cq_level */
+ 0, /* rc_max_intra_bitrate_pct */
};
struct vpx_codec_alg_priv
@@ -631,27 +620,21 @@
vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
{
vpx_codec_err_t res = VPX_CODEC_OK;
- struct vpx_codec_alg_priv *priv;
- vpx_codec_enc_cfg_t *cfg;
- unsigned int i;
- struct VP8_COMP *optr;
vp8_rtcd();
if (!ctx->priv)
{
- priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+ struct vpx_codec_alg_priv *priv =
+ (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv));
if (!priv)
{
return VPX_CODEC_MEM_ERROR;
}
- ctx->priv = &priv->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = priv;
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
if (ctx->config.enc)
@@ -659,21 +642,11 @@
/* Update the reference to the config structure to an
* internal copy.
*/
- ctx->priv->alg_priv->cfg = *ctx->config.enc;
- ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
}
- cfg = &ctx->priv->alg_priv->cfg;
-
- /* Select the extra vp8 configuration table based on the current
- * usage value. If the current usage value isn't found, use the
- * values for usage case 0.
- */
- for (i = 0;
- extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- i++);
-
- priv->vp8_cfg = extracfg_map[i].cfg;
+ priv->vp8_cfg = default_extracfg;
priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
@@ -696,17 +669,10 @@
if (!res)
{
- set_vp8e_config(&ctx->priv->alg_priv->oxcf,
- ctx->priv->alg_priv->cfg,
- ctx->priv->alg_priv->vp8_cfg,
- mr_cfg);
-
- optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
-
- if (!optr)
+ set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg);
+ priv->cpi = vp8_create_compressor(&priv->oxcf);
+ if (!priv->cpi)
res = VPX_CODEC_MEM_ERROR;
- else
- ctx->priv->alg_priv->cpi = optr;
}
}
@@ -727,24 +693,30 @@
free(ctx->cx_data);
vp8_remove_compressor(&ctx->cpi);
- free(ctx);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
YV12_BUFFER_CONFIG *yv12)
{
+ const int y_w = img->d_w;
+ const int y_h = img->d_h;
+ const int uv_w = (img->d_w + 1) / 2;
+ const int uv_h = (img->d_h + 1) / 2;
vpx_codec_err_t res = VPX_CODEC_OK;
yv12->y_buffer = img->planes[VPX_PLANE_Y];
yv12->u_buffer = img->planes[VPX_PLANE_U];
yv12->v_buffer = img->planes[VPX_PLANE_V];
- yv12->y_crop_width = img->d_w;
- yv12->y_crop_height = img->d_h;
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
- yv12->uv_width = (1 + yv12->y_width) / 2;
- yv12->uv_height = (1 + yv12->y_height) / 2;
+ yv12->y_crop_width = y_w;
+ yv12->y_crop_height = y_h;
+ yv12->y_width = y_w;
+ yv12->y_height = y_h;
+ yv12->uv_crop_width = uv_w;
+ yv12->uv_crop_height = uv_h;
+ yv12->uv_width = uv_w;
+ yv12->uv_height = uv_h;
yv12->y_stride = img->stride[VPX_PLANE_Y];
yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -1273,6 +1245,9 @@
320, /* g_width */
240, /* g_height */
+ VPX_BITS_8, /* g_bit_depth */
+ 8, /* g_input_bit_depth */
+
{1, 30}, /* g_timebase */
0, /* g_error_resilient */
@@ -1341,10 +1316,10 @@
vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
{
- NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
- NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
- NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
- NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ NULL, /* vpx_codec_peek_si_fn_t peek_si; */
+ NULL, /* vpx_codec_get_si_fn_t get_si; */
+ NULL, /* vpx_codec_decode_fn_t decode; */
+ NULL, /* vpx_codec_frame_get_fn_t frame_get; */
},
{
1, /* 1 cfg map */
@@ -1352,7 +1327,7 @@
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
vp8e_set_config,
- NOT_IMPLEMENTED,
+ NULL,
vp8e_get_preview,
vp8e_mr_alloc_mem,
} /* encoder functions */
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 9a0cdb7..3ab8ed0 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -80,30 +80,30 @@
static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
{
- ctx->priv =
- (vpx_codec_priv_t *)vpx_memalign(8, sizeof(vpx_codec_alg_priv_t));
- vpx_memset(ctx->priv, 0, sizeof(vpx_codec_alg_priv_t));
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = (vpx_codec_alg_priv_t *)ctx->priv;
- ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
- ctx->priv->alg_priv->decrypt_cb = NULL;
- ctx->priv->alg_priv->decrypt_state = NULL;
- ctx->priv->alg_priv->flushed = 0;
+ vpx_codec_alg_priv_t *priv =
+ (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv));
+
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
+ priv->si.sz = sizeof(priv->si);
+ priv->decrypt_cb = NULL;
+ priv->decrypt_state = NULL;
+ priv->flushed = 0;
+
if (ctx->config.dec)
{
/* Update the reference to the config structure to an internal copy. */
- ctx->priv->alg_priv->cfg = *ctx->config.dec;
- ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
}
}
static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
vpx_codec_priv_enc_mr_cfg_t *data)
{
- vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ vpx_codec_alg_priv_t *priv = NULL;
(void) data;
vp8_rtcd();
@@ -115,29 +115,30 @@
if (!ctx->priv)
{
vp8_init_ctx(ctx);
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
/* initialize number of fragments to zero */
- ctx->priv->alg_priv->fragments.count = 0;
+ priv->fragments.count = 0;
/* is input fragments enabled? */
- ctx->priv->alg_priv->fragments.enabled =
- (ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_INPUT_FRAGMENTS);
+ priv->fragments.enabled =
+ (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
/*post processing level initialized to do nothing */
}
+ else
+ {
+ priv = (vpx_codec_alg_priv_t *)ctx->priv;
+ }
- ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads =
- (ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_FRAME_THREADING);
+ priv->yv12_frame_buffers.use_frame_threads =
+ (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING);
/* for now, disable frame threading */
- ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0;
+ priv->yv12_frame_buffers.use_frame_threads = 0;
- if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads &&
- (( ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_ERROR_CONCEALMENT)
- || ( ctx->priv->alg_priv->base.init_flags &
- VPX_CODEC_USE_INPUT_FRAGMENTS) ) )
+ if (priv->yv12_frame_buffers.use_frame_threads &&
+ ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) ||
+ (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS)))
{
/* row-based threading, error concealment, and input fragments will
* not be supported when using frame-based threading */
@@ -566,17 +567,23 @@
static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
YV12_BUFFER_CONFIG *yv12)
{
+ const int y_w = img->d_w;
+ const int y_h = img->d_h;
+ const int uv_w = (img->d_w + 1) / 2;
+ const int uv_h = (img->d_h + 1) / 2;
vpx_codec_err_t res = VPX_CODEC_OK;
yv12->y_buffer = img->planes[VPX_PLANE_Y];
yv12->u_buffer = img->planes[VPX_PLANE_U];
yv12->v_buffer = img->planes[VPX_PLANE_V];
- yv12->y_crop_width = img->d_w;
- yv12->y_crop_height = img->d_h;
- yv12->y_width = img->d_w;
- yv12->y_height = img->d_h;
- yv12->uv_width = yv12->y_width / 2;
- yv12->uv_height = yv12->y_height / 2;
+ yv12->y_crop_width = y_w;
+ yv12->y_crop_height = y_h;
+ yv12->y_width = y_w;
+ yv12->y_height = y_h;
+ yv12->uv_crop_width = uv_w;
+ yv12->uv_crop_height = uv_h;
+ yv12->uv_width = uv_w;
+ yv12->uv_height = uv_h;
yv12->y_stride = img->stride[VPX_PLANE_Y];
yv12->uv_stride = img->stride[VPX_PLANE_U];
@@ -809,15 +816,15 @@
vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
- NOT_IMPLEMENTED,
+ NULL,
},
{ /* encoder functions */
0,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED,
- NOT_IMPLEMENTED
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL
}
};
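The vp8_dx_iface changes above drop the ctx->priv->alg_priv indirection: the algorithm-private struct now embeds the public vpx_codec_priv_t as its leading member (hence the priv->base accesses), so a single cast converts between the two pointer types. A minimal sketch of that C idiom, with hypothetical names standing in for the real structs:

    #include <stdlib.h>

    typedef struct { int init_flags; } pub_priv_t;  /* plays vpx_codec_priv_t */

    typedef struct {
      pub_priv_t base;  /* must stay the first member for the casts below */
      int flushed;
    } alg_priv_t;       /* plays vpx_codec_alg_priv_t */

    static pub_priv_t *make_priv(void) {
      alg_priv_t *priv = (alg_priv_t *)calloc(1, sizeof(*priv));
      if (!priv) return NULL;
      /* &priv->base == (pub_priv_t *)priv because base is first. */
      return (pub_priv_t *)priv;
    }

    static alg_priv_t *get_alg(pub_priv_t *p) {
      return (alg_priv_t *)p;  /* the inverse cast, as vp8_init performs */
    }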
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 5733048..551271e 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -36,11 +36,9 @@
# File list for neon
# encoder
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/picklpf_arm.c
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/shortfdct_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/subtract_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index b379656..21ae8d5 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -34,7 +34,7 @@
cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
- cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE;
+ cm->mi_stride = calc_mi_size(cm->mi_cols);
cm->mb_cols = (cm->mi_cols + 1) >> 1;
cm->mb_rows = (cm->mi_rows + 1) >> 1;
@@ -60,16 +60,18 @@
for (i = 0; i < 2; ++i) {
cm->mip_array[i] =
- (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip));
+ (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO));
if (cm->mip_array[i] == NULL)
return 1;
cm->mi_grid_base_array[i] =
- (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+ (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
if (cm->mi_grid_base_array[i] == NULL)
return 1;
}
+ cm->mi_alloc_size = mi_size;
+
// Init the index.
cm->mi_idx = 0;
cm->prev_mi_idx = 1;
@@ -131,7 +133,8 @@
vp9_free_context_buffers(cm);
vp9_set_mb_mi(cm, width, height);
- if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail;
+ if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
+ goto fail;
cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map) goto fail;
@@ -174,7 +177,11 @@
for (i = 0; i < FRAME_BUFFERS; ++i) {
cm->frame_bufs[i].ref_count = 0;
if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
- ss_x, ss_y, VP9_ENC_BORDER_IN_PIXELS) < 0)
+ ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
}
@@ -182,6 +189,9 @@
#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS) < 0)
goto fail;
#endif
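Both the stride and the allocated row count above now go through calc_mi_size() instead of repeating the + MI_BLOCK_SIZE arithmetic inline, so the mode-info array is padded by one superblock of border in each dimension. A sketch of what the helper presumably computes, consistent with the call sites (the real definition lives elsewhere in vp9/common and may differ):

    /* mi units are 8x8 pixels; MI_BLOCK_SIZE is one 64x64 superblock of them. */
    #define MI_BLOCK_SIZE 8

    static inline int calc_mi_size(int len) {
      return len + MI_BLOCK_SIZE;  /* pad a mi-unit dimension by the border */
    }

With that reading, alloc_mi() reserves calc_mi_size(mi_cols) * calc_mi_size(mi_rows) entries, the same size as the old cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE) expression.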
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 951e6e0..7d4ed95 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -21,6 +21,7 @@
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
@@ -176,7 +177,7 @@
};
struct macroblockd_plane {
- int16_t *dqcoeff;
+ tran_low_t *dqcoeff;
PLANE_TYPE plane_type;
int subsampling_x;
int subsampling_y;
@@ -223,11 +224,17 @@
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ /* Bit depth: 8, 10, 12 */
+ int bd;
+ DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+#endif
+
int lossless;
int corrupted;
- DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
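The dqcoeff storage above switches from int16_t to tran_low_t, the coefficient type introduced by the newly included vp9_idct.h: wide enough for 10/12-bit coefficients when high bit depth is compiled in, and the old int16_t otherwise. Roughly, paraphrasing the typedefs rather than quoting them:

    #include <stdint.h>

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int64_t tran_high_t;  /* intermediate transform precision */
    typedef int32_t tran_low_t;   /* storage type for coefficients */
    #else
    typedef int32_t tran_high_t;
    typedef int16_t tran_low_t;
    #endif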
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 2788e66..5587192 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -64,6 +64,11 @@
return num_values > 0 ? get_msb(num_values) + 1 : 0;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 ))
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(cm, lval, expr) do { \
lval = (expr); \
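CONVERT_TO_SHORTPTR/CONVERT_TO_BYTEPTR let a uint16_t sample buffer travel through interfaces typed as uint8_t*: the byte-pointer form stores the real address halved, so the scheme only works for 2-byte-aligned uint16_t allocations and only if every consumer converts back before dereferencing. A minimal round trip under those assumptions, reusing the macros exactly as defined above:

    #include <assert.h>
    #include <stdint.h>

    #define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1))

    void round_trip(void) {
      static uint16_t buf[16];                       /* 2-byte aligned */
      uint8_t *tagged = CONVERT_TO_BYTEPTR(buf);     /* address >> 1: must not be dereferenced */
      uint16_t *back = CONVERT_TO_SHORTPTR(tagged);  /* address << 1: original pointer */
      assert(back == buf);
      back[0] = 1023;                                /* e.g. a 10-bit sample */
    }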
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index d776313..8817fdb 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -40,12 +40,6 @@
MAX_PROFILES
} BITSTREAM_PROFILE;
-typedef enum BIT_DEPTH {
- BITS_8,
- BITS_10,
- BITS_12
-} BIT_DEPTH;
-
typedef enum BLOCK_SIZE {
BLOCK_4X4,
BLOCK_4X8,
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 856d41e..b196fc5 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,14 +18,47 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1, the transform performs
+// strict overflow wrapping to match expected hardware implementations.
+// A bd of 8 uses tran_low_t with 16 bits, so 16 bits must be removed;
+// a bd of 10 uses tran_low_t with 18 bits, so 14 bits must be removed;
+// a bd of 12 uses tran_low_t with 20 bits, so 12 bits must be removed.
+// In general, a bd of x uses tran_low_t with 8+x bits, so 24-x bits
+// must be removed.
+#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x) (x)
+#endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
+ tran_low_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
+ tran_high_t trans, int bd) {
+ trans = WRAPLOW(trans);
+ switch (bd) {
+ case 8:
+ default:
+ return clamp_high(WRAPLOW(dest + trans), 0, 255);
+ case 10:
+ return clamp_high(WRAPLOW(dest + trans), 0, 1023);
+ case 12:
+ return clamp_high(WRAPLOW(dest + trans), 0, 4095);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
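With emulation enabled, WRAPLOW(x) shifts x up so that bit bd+7 becomes the sign bit and then shifts back down, i.e. it sign-extends from the 16/18/20-bit width a fixed-width hardware datapath would keep for bd = 8/10/12. A worked check for bd = 8, assuming (as the macro itself does) a bd variable in scope, plus the arithmetic right-shift and wraparound left-shift behavior this codebase relies on:

    #include <assert.h>
    #include <stdint.h>

    #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))

    void wraplow_demo(void) {
      const int bd = 8;                  /* 16-bit wrapping: shift by 24 - 8 */
      assert(WRAPLOW(32767) == 32767);   /* already fits in 16 bits */
      assert(WRAPLOW(32768) == -32768);  /* wraps exactly like int16_t */
      assert(WRAPLOW(-32769) == 32767);
    }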
+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
- int16_t output[16];
- int a1, b1, c1, d1, e1;
- const int16_t *ip = input;
- int16_t *op = output;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
for (i = 0; i < 4; i++) {
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -70,12 +103,12 @@
}
}
-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
int i;
- int a1, e1;
- int16_t tmp[4];
- const int16_t *ip = in;
- int16_t *op = tmp;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
@@ -96,9 +129,9 @@
}
}
-static void idct4(const int16_t *input, int16_t *output) {
- int16_t step[4];
- int temp1, temp2;
+static void idct4(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -116,11 +149,11 @@
output[3] = step[0] - step[3];
}
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[4 * 4];
- int16_t *outptr = out;
+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t temp_in[4], temp_out[4];
// Rows
for (i = 0; i < 4; ++i) {
@@ -140,10 +173,11 @@
}
}
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+ int dest_stride) {
int i;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -156,9 +190,9 @@
}
}
-static void idct8(const int16_t *input, int16_t *output) {
- int16_t step1[8], step2[8];
- int temp1, temp2;
+static void idct8(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0];
step1[2] = input[4];
@@ -201,11 +235,11 @@
output[7] = step1[0] - step1[7];
}
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[8 * 8];
- int16_t *outptr = out;
+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
// First transform rows
for (i = 0; i < 8; ++i) {
@@ -225,10 +259,10 @@
}
}
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
@@ -238,13 +272,13 @@
}
}
-static void iadst4(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void iadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[0];
- int x1 = input[1];
- int x2 = input[2];
- int x3 = input[3];
+ tran_high_t x0 = input[0];
+ tran_high_t x1 = input[1];
+ tran_high_t x2 = input[2];
+ tran_high_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -280,7 +314,7 @@
output[3] = dct_const_round_shift(s3);
}
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
const transform_2d IHT_4[] = {
{ idct4, idct4 }, // DCT_DCT = 0
@@ -290,9 +324,9 @@
};
int i, j;
- int16_t out[4 * 4];
- int16_t *outptr = out;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
@@ -311,17 +345,17 @@
+ dest[j * stride + i]);
}
}
-static void iadst8(const int16_t *input, int16_t *output) {
+static void iadst8(const tran_low_t *input, tran_low_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[7];
- int x1 = input[0];
- int x2 = input[5];
- int x3 = input[2];
- int x4 = input[3];
- int x5 = input[4];
- int x6 = input[1];
- int x7 = input[6];
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
output[0] = output[1] = output[2] = output[3] = output[4]
@@ -395,12 +429,12 @@
{ iadst8, iadst8 } // ADST_ADST = 3
};
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
- int16_t out[8 * 8];
- int16_t *outptr = out;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
// inverse transform row vectors
@@ -421,11 +455,11 @@
}
}
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[8 * 8] = { 0 };
- int16_t *outptr = out;
+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
// First transform rows
// Only the first 4 rows have non-zero coefs.
@@ -446,9 +480,9 @@
}
}
-static void idct16(const int16_t *input, int16_t *output) {
- int16_t step1[16], step2[16];
- int temp1, temp2;
+static void idct16(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0/2];
@@ -611,11 +645,12 @@
output[15] = step2[0] - step2[15];
}
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[16 * 16];
- int16_t *outptr = out;
+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
// First transform rows
for (i = 0; i < 16; ++i) {
@@ -635,25 +670,26 @@
}
}
-static void iadst16(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+static void iadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
- int x0 = input[15];
- int x1 = input[0];
- int x2 = input[13];
- int x3 = input[2];
- int x4 = input[11];
- int x5 = input[4];
- int x6 = input[9];
- int x7 = input[6];
- int x8 = input[7];
- int x9 = input[8];
- int x10 = input[5];
- int x11 = input[10];
- int x12 = input[3];
- int x13 = input[12];
- int x14 = input[1];
- int x15 = input[14];
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
@@ -813,12 +849,12 @@
{ iadst16, iadst16 } // ADST_ADST = 3
};
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
- int16_t out[16 * 16];
- int16_t *outptr = out;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
// Rows
@@ -839,11 +875,12 @@
}
}
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[16 * 16] = { 0 };
- int16_t *outptr = out;
+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
// First transform rows. Since all non-zero dct coefficients are in the
// upper-left 4x4 area, we only need to calculate the first 4 rows here.
@@ -864,10 +901,10 @@
}
}
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
@@ -877,9 +914,9 @@
}
}
-static void idct32(const int16_t *input, int16_t *output) {
- int16_t step1[32], step2[32];
- int temp1, temp2;
+static void idct32(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0];
@@ -1244,11 +1281,12 @@
output[31] = step1[0] - step1[31];
}
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[32 * 32];
- int16_t *outptr = out;
+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[32], temp_out[32];
+ tran_low_t temp_in[32], temp_out[32];
// Rows
for (i = 0; i < 32; ++i) {
@@ -1265,7 +1303,7 @@
if (zero_coeff[0] | zero_coeff[1])
idct32(input, outptr);
else
- vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+ vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -1281,11 +1319,12 @@
}
}
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[32 * 32] = {0};
- int16_t *outptr = out;
+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[32], temp_out[32];
+ tran_low_t temp_in[32], temp_out[32];
// Rows
// Only the upper-left 8x8 has non-zero coeffs.
@@ -1306,11 +1345,11 @@
}
}
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
+ tran_high_t a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -1322,7 +1361,8 @@
}
// idct
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
if (eob > 1)
vp9_idct4x4_16_add(input, dest, stride);
else
@@ -1330,14 +1370,16 @@
}
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
if (eob > 1)
vp9_iwht4x4_16_add(input, dest, stride);
else
vp9_iwht4x4_1_add(input, dest, stride);
}
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
// If dc is 1, then input[0] is the reconstructed value and needs no
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
@@ -1354,7 +1396,7 @@
vp9_idct8x8_64_add(input, dest, stride);
}
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
@@ -1367,7 +1409,7 @@
vp9_idct16x16_256_add(input, dest, stride);
}
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
@@ -1379,7 +1421,7 @@
}
// iht
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT)
vp9_idct4x4_add(input, dest, stride, eob);
@@ -1387,7 +1429,7 @@
vp9_iht4x4_16_add(input, dest, stride, tx_type);
}
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct8x8_add(input, dest, stride, eob);
@@ -1396,7 +1438,7 @@
}
}
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct16x16_add(input, dest, stride, eob);
@@ -1404,3 +1446,1433 @@
vp9_iht16x16_256_add(input, dest, stride, tx_type);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1);
+ op[1] = WRAPLOW(b1);
+ op[2] = WRAPLOW(c1);
+ op[3] = WRAPLOW(d1);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
+ dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
+ dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
+ dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void) bd;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3]);
+ output[1] = WRAPLOW(step[1] + step[2]);
+ output[2] = WRAPLOW(step[1] - step[2]);
+ output[3] = WRAPLOW(step[0] - step[3]);
+}
+
+void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void) bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1);
+ op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ high_idct4(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ high_idct4(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+}
+
+void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
+ dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
+ dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
+ dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
+ dest += dest_stride;
+ }
+}
+
+static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2 & stage 3 - even half
+ high_idct4(step1, step1, bd);
+
+ // stage 2 - odd half
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ // stage 3 - odd half
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7]);
+ output[1] = WRAPLOW(step1[1] + step1[6]);
+ output[2] = WRAPLOW(step1[2] + step1[5]);
+ output[3] = WRAPLOW(step1[3] + step1[4]);
+ output[4] = WRAPLOW(step1[3] - step1[4]);
+ output[5] = WRAPLOW(step1[2] - step1[5]);
+ output[6] = WRAPLOW(step1[1] - step1[6]);
+ output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 8; ++i) {
+ high_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ high_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5),
+ bd);
+ }
+}
+
+void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[0];
+ tran_high_t x1 = input[1];
+ tran_high_t x2 = input[2];
+ tran_high_t x3 = input[3];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3)) {
+ vpx_memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0));
+ output[1] = WRAPLOW(dct_const_round_shift(s1));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s3));
+}
+
+void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ const high_transform_2d IHT_4[] = {
+ { high_idct4, high_idct4 }, // DCT_DCT = 0
+ { high_iadst4, high_idct4 }, // ADST_DCT = 1
+ { high_idct4, high_iadst4 }, // DCT_ADST = 2
+ { high_iadst4, high_iadst4 } // ADST_ADST = 3
+ };
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Inverse transform row vectors.
+ for (i = 0; i < 4; ++i) {
+ IHT_4[tx_type].rows(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Inverse transform column vectors.
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ IHT_4[tx_type].cols(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+}
+
+static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ vpx_memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_8[] = {
+ { high_idct8, high_idct8 }, // DCT_DCT = 0
+ { high_iadst8, high_idct8 }, // ADST_DCT = 1
+ { high_idct8, high_iadst8 }, // DCT_ADST = 2
+ { high_iadst8, high_iadst8 } // ADST_ADST = 3
+};
+
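HIGH_IHT_8 (like the per-call IHT_4 table earlier) pairs a column transform with a row transform, each taking the extra bd argument. The high_transform_2d type itself is not shown in this hunk; presumably it mirrors the existing transform_2d, along these lines (relying on the tran_low_t typedef noted earlier):

    typedef void (*high_transform_1d)(const tran_low_t *input,
                                      tran_low_t *output, int bd);

    typedef struct {
      high_transform_1d cols, rows;  /* vertical then horizontal 1-D passes */
    } high_transform_2d;

so ADST_DCT = 1, for example, selects iadst for the columns and idct for the rows.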
+void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ int i, j;
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+ const high_transform_2d ht = HIGH_IHT_8[tx_type];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Inverse transform row vectors.
+ for (i = 0; i < 8; ++i) {
+ ht.rows(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Inverse transform column vectors.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ ht.cols(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+}
+
+void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+  // Only the first 4 rows have non-zero coefs.
+ for (i = 0; i < 4; ++i) {
+ high_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ high_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+}
+
+static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15]);
+ output[1] = WRAPLOW(step2[1] + step2[14]);
+ output[2] = WRAPLOW(step2[2] + step2[13]);
+ output[3] = WRAPLOW(step2[3] + step2[12]);
+ output[4] = WRAPLOW(step2[4] + step2[11]);
+ output[5] = WRAPLOW(step2[5] + step2[10]);
+ output[6] = WRAPLOW(step2[6] + step2[9]);
+ output[7] = WRAPLOW(step2[7] + step2[8]);
+ output[8] = WRAPLOW(step2[7] - step2[8]);
+ output[9] = WRAPLOW(step2[6] - step2[9]);
+ output[10] = WRAPLOW(step2[5] - step2[10]);
+ output[11] = WRAPLOW(step2[4] - step2[11]);
+ output[12] = WRAPLOW(step2[3] - step2[12]);
+ output[13] = WRAPLOW(step2[2] - step2[13]);
+ output[14] = WRAPLOW(step2[1] - step2[14]);
+ output[15] = WRAPLOW(step2[0] - step2[15]);
+}
+
+void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 16; ++i) {
+ high_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ high_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ vpx_memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_16[] = {
+ { high_idct16, high_idct16 }, // DCT_DCT = 0
+ { high_iadst16, high_idct16 }, // ADST_DCT = 1
+ { high_idct16, high_iadst16 }, // DCT_ADST = 2
+ { high_iadst16, high_iadst16 } // ADST_ADST = 3
+};
+
+void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+ const high_transform_2d ht = HIGH_IHT_16[tx_type];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ ht.rows(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ ht.cols(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows. Since all non-zero dct coefficients are in the
+ // upper-left 4x4 area, we only need to calculate the first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ high_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ high_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step2[16] = WRAPLOW(step1[16] + step1[17]);
+ step2[17] = WRAPLOW(step1[16] - step1[17]);
+ step2[18] = WRAPLOW(-step1[18] + step1[19]);
+ step2[19] = WRAPLOW(step1[18] + step1[19]);
+ step2[20] = WRAPLOW(step1[20] + step1[21]);
+ step2[21] = WRAPLOW(step1[20] - step1[21]);
+ step2[22] = WRAPLOW(-step1[22] + step1[23]);
+ step2[23] = WRAPLOW(step1[22] + step1[23]);
+ step2[24] = WRAPLOW(step1[24] + step1[25]);
+ step2[25] = WRAPLOW(step1[24] - step1[25]);
+ step2[26] = WRAPLOW(-step1[26] + step1[27]);
+ step2[27] = WRAPLOW(step1[26] + step1[27]);
+ step2[28] = WRAPLOW(step1[28] + step1[29]);
+ step2[29] = WRAPLOW(step1[28] - step1[29]);
+ step2[30] = WRAPLOW(-step1[30] + step1[31]);
+ step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19]);
+ step2[17] = WRAPLOW(step1[17] + step1[18]);
+ step2[18] = WRAPLOW(step1[17] - step1[18]);
+ step2[19] = WRAPLOW(step1[16] - step1[19]);
+ step2[20] = WRAPLOW(-step1[20] + step1[23]);
+ step2[21] = WRAPLOW(-step1[21] + step1[22]);
+ step2[22] = WRAPLOW(step1[21] + step1[22]);
+ step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27]);
+ step2[25] = WRAPLOW(step1[25] + step1[26]);
+ step2[26] = WRAPLOW(step1[25] - step1[26]);
+ step2[27] = WRAPLOW(step1[24] - step1[27]);
+ step2[28] = WRAPLOW(-step1[28] + step1[31]);
+ step2[29] = WRAPLOW(-step1[29] + step1[30]);
+ step2[30] = WRAPLOW(step1[29] + step1[30]);
+ step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = WRAPLOW(step1[14]);
+ step2[15] = WRAPLOW(step1[15]);
+
+ step2[16] = WRAPLOW(step1[16] + step1[23]);
+ step2[17] = WRAPLOW(step1[17] + step1[22]);
+ step2[18] = WRAPLOW(step1[18] + step1[21]);
+ step2[19] = WRAPLOW(step1[19] + step1[20]);
+ step2[20] = WRAPLOW(step1[19] - step1[20]);
+ step2[21] = WRAPLOW(step1[18] - step1[21]);
+ step2[22] = WRAPLOW(step1[17] - step1[22]);
+ step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31]);
+ step2[25] = WRAPLOW(-step1[25] + step1[30]);
+ step2[26] = WRAPLOW(-step1[26] + step1[29]);
+ step2[27] = WRAPLOW(-step1[27] + step1[28]);
+ step2[28] = WRAPLOW(step1[27] + step1[28]);
+ step2[29] = WRAPLOW(step1[26] + step1[29]);
+ step2[30] = WRAPLOW(step1[25] + step1[30]);
+ step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15]);
+ step1[1] = WRAPLOW(step2[1] + step2[14]);
+ step1[2] = WRAPLOW(step2[2] + step2[13]);
+ step1[3] = WRAPLOW(step2[3] + step2[12]);
+ step1[4] = WRAPLOW(step2[4] + step2[11]);
+ step1[5] = WRAPLOW(step2[5] + step2[10]);
+ step1[6] = WRAPLOW(step2[6] + step2[9]);
+ step1[7] = WRAPLOW(step2[7] + step2[8]);
+ step1[8] = WRAPLOW(step2[7] - step2[8]);
+ step1[9] = WRAPLOW(step2[6] - step2[9]);
+ step1[10] = WRAPLOW(step2[5] - step2[10]);
+ step1[11] = WRAPLOW(step2[4] - step2[11]);
+ step1[12] = WRAPLOW(step2[3] - step2[12]);
+ step1[13] = WRAPLOW(step2[2] - step2[13]);
+ step1[14] = WRAPLOW(step2[1] - step2[14]);
+ step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31]);
+ output[1] = WRAPLOW(step1[1] + step1[30]);
+ output[2] = WRAPLOW(step1[2] + step1[29]);
+ output[3] = WRAPLOW(step1[3] + step1[28]);
+ output[4] = WRAPLOW(step1[4] + step1[27]);
+ output[5] = WRAPLOW(step1[5] + step1[26]);
+ output[6] = WRAPLOW(step1[6] + step1[25]);
+ output[7] = WRAPLOW(step1[7] + step1[24]);
+ output[8] = WRAPLOW(step1[8] + step1[23]);
+ output[9] = WRAPLOW(step1[9] + step1[22]);
+ output[10] = WRAPLOW(step1[10] + step1[21]);
+ output[11] = WRAPLOW(step1[11] + step1[20]);
+ output[12] = WRAPLOW(step1[12] + step1[19]);
+ output[13] = WRAPLOW(step1[13] + step1[18]);
+ output[14] = WRAPLOW(step1[14] + step1[17]);
+ output[15] = WRAPLOW(step1[15] + step1[16]);
+ output[16] = WRAPLOW(step1[15] - step1[16]);
+ output[17] = WRAPLOW(step1[14] - step1[17]);
+ output[18] = WRAPLOW(step1[13] - step1[18]);
+ output[19] = WRAPLOW(step1[12] - step1[19]);
+ output[20] = WRAPLOW(step1[11] - step1[20]);
+ output[21] = WRAPLOW(step1[10] - step1[21]);
+ output[22] = WRAPLOW(step1[9] - step1[22]);
+ output[23] = WRAPLOW(step1[8] - step1[23]);
+ output[24] = WRAPLOW(step1[7] - step1[24]);
+ output[25] = WRAPLOW(step1[6] - step1[25]);
+ output[26] = WRAPLOW(step1[5] - step1[26]);
+ output[27] = WRAPLOW(step1[4] - step1[27]);
+ output[28] = WRAPLOW(step1[3] - step1[28]);
+ output[29] = WRAPLOW(step1[2] - step1[29]);
+ output[30] = WRAPLOW(step1[1] - step1[30]);
+ output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
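Every butterfly stage above funnels through the same fixed-point rounding primitive. A minimal standalone sketch of that arithmetic, assuming DCT_CONST_BITS is 14 and the Q14 cospi constants from vp9_idct.h (not part of the change):

    #include <stdint.h>
    #include <stdio.h>

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      const int64_t cospi_16_64 = 11585;  /* round(16384 * cos(16*Pi/64)) */
      const int64_t x = 100;
      /* 100 * cos(Pi/4) = 70.71...; the Q14 multiply-and-round yields 71. */
      printf("%lld\n",
             (long long)ROUND_POWER_OF_TWO(x * cospi_16_64, DCT_CONST_BITS));
      return 0;
    }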
+void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_low_t zero_coeff[16];
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ high_idct32(input, outptr, bd);
+ else
+ vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ high_idct32(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
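The row pass above detects all-zero rows with an unrolled OR tree: pairs of coefficients are OR-ed together in halving steps until two words summarize the whole row. It is functionally equivalent to this straight loop (a sketch; tran_low_t is int32_t in highbitdepth builds):

    /* Equivalent single-loop form of the zero_coeff reduction above. */
    static int row_is_all_zero(const int32_t *row) {
      int32_t acc = 0;
      int j;
      for (j = 0; j < 32; ++j)
        acc |= row[j];  /* any non-zero bit survives the OR */
      return acc == 0;
    }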
+void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+  // Only the upper-left 8x8 has non-zero coefficients.
+ for (i = 0; i < 8; ++i) {
+ high_idct32(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ high_idct32(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ int a1;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
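The DC-only path applies cospi_16_64 (about 0.7071 in Q14) twice, once for the row pass and once for the column pass, so the net scale is roughly one half before the final shift by 6. Worked through for dc = 1024: 1024 * 11585 round-shifted by 14 gives 724; 724 * 11585 round-shifted by 14 gives 512; ROUND_POWER_OF_TWO(512, 6) = 8, the increment clipped into every pixel.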
+// idct
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ vp9_high_idct4x4_16_add(input, dest, stride, bd);
+ else
+ vp9_high_idct4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+ else
+ vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value and does not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
+
+ // The calculation can be simplified if there are not many non-zero dct
+ // coefficients. Use eobs to decide what to do.
+ // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+ // Combine that with code here.
+ // DC only DCT coefficient
+ if (eob == 1) {
+ vp9_high_idct8x8_1_add(input, dest, stride, bd);
+ } else if (eob <= 10) {
+ vp9_high_idct8x8_10_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct8x8_64_add(input, dest, stride, bd);
+ }
+}
+
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ // The calculation can be simplified if there are not many non-zero dct
+ // coefficients. Use eobs to separate different cases.
+ // DC only DCT coefficient.
+ if (eob == 1) {
+ vp9_high_idct16x16_1_add(input, dest, stride, bd);
+ } else if (eob <= 10) {
+ vp9_high_idct16x16_10_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct16x16_256_add(input, dest, stride, bd);
+ }
+}
+
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+  // Non-zero coefficients only in the upper-left 8x8 when eob <= 34.
+ if (eob == 1) {
+ vp9_high_idct32x32_1_add(input, dest, stride, bd);
+ } else if (eob <= 34) {
+ vp9_high_idct32x32_34_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+ }
+}
+
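The 34 threshold matches vp9_high_idct32x32_34_add_c above, which only runs the inverse transform over the first eight rows. A hypothetical decode-side call, with dqcoeff, eob, dest8, stride and bd as assumed names for the dequantizer outputs and reconstruction buffer:

    /* dqcoeff/eob come from the dequantizer; dest8 is the
     * CONVERT_TO_SHORTPTR-compatible 16-bit reconstruction buffer. */
    vp9_high_idct32x32_add(dqcoeff, dest8, stride, eob, bd);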
+// iht
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT)
+ vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+ else
+ vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT) {
+ vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+ } else {
+ vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+ }
+}
+
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT) {
+ vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+ } else {
+ vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 7f595e1..694be3c 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -36,52 +36,69 @@
#define dual_set_epi16(a, b) \
_mm_set_epi16(b, b, b, b, a, a, a, a)
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
+
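A back-of-envelope check of why the intermediate type widens to 64 bits in highbitdepth builds (a sketch, not part of the change):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Per the range note further down in this header, a valid 12-bit
       * stream keeps intermediate coefficients within a signed 20-bit
       * integer; one butterfly multiply by a Q14 cosine constant then
       * needs ~34 bits, overflowing int32_t but fitting in int64_t. */
      const int64_t worst = (int64_t)((1 << 19) - 1) * 16364;
      printf("%lld > %d\n", (long long)worst, INT32_MAX);
      return 0;
    }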
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const int cospi_1_64 = 16364;
-static const int cospi_2_64 = 16305;
-static const int cospi_3_64 = 16207;
-static const int cospi_4_64 = 16069;
-static const int cospi_5_64 = 15893;
-static const int cospi_6_64 = 15679;
-static const int cospi_7_64 = 15426;
-static const int cospi_8_64 = 15137;
-static const int cospi_9_64 = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const int sinpi_1_9 = 5283;
-static const int sinpi_2_9 = 9929;
-static const int sinpi_3_9 = 13377;
-static const int sinpi_4_9 = 15212;
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
-static INLINE int dct_const_round_shift(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -91,32 +108,59 @@
assert(INT16_MIN <= rv);
assert(rv <= INT16_MAX);
#endif
- return (int16_t)rv;
+ return (tran_low_t)rv;
}
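The three ranges quoted above collapse to one formula: a valid bd-bit stream keeps rv within a signed (bd + 8)-bit integer. A hypothetical debug-only guard along those lines (check_range_sketch is an invented name, not part of this change):

    static INLINE void check_range_sketch(tran_high_t rv, int bd) {
      /* 8-bit -> +-2^15, 10-bit -> +-2^17, 12-bit -> +-2^19. */
      const tran_high_t bound = (tran_high_t)1 << (bd + 7);
      assert(-bound <= rv && rv < bound);
      (void)rv;
      (void)bd;
      (void)bound;
    }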
-typedef void (*transform_1d)(const int16_t*, int16_t*);
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
+typedef struct {
+ high_transform_1d cols, rows; // vertical and horizontal
+} high_transform_2d;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
eob);
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 3b39d42..4d03c4d 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1193,7 +1193,7 @@
}
}
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
VP9_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
@@ -1247,9 +1247,8 @@
y_only);
}
-int vp9_loop_filter_worker(void *arg1, void *arg2) {
- LFWorkerData *const lf_data = (LFWorkerData*)arg1;
- (void)arg2;
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+ (void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
return 1;
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 6fa2773..69e7dd0 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -111,13 +111,13 @@
int y_only, int partial_frame);
// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only);
typedef struct LoopFilterWorkerData {
- const YV12_BUFFER_CONFIG *frame_buffer;
+ YV12_BUFFER_CONFIG *frame_buffer;
struct VP9Common *cm;
struct macroblockd_plane planes[MAX_MB_PLANE];
@@ -129,8 +129,8 @@
int num_lf_workers;
} LFWorkerData;
-// Operates on the rows described by LFWorkerData passed as 'arg1'.
-int vp9_loop_filter_worker(void *arg1, void *arg2);
+// Operates on the rows described by 'lf_data'.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h
index 3eb7f9d..5d89da8 100644
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -34,6 +34,14 @@
int32_t col;
} MV32;
+static INLINE int is_zero_mv(const MV *mv) {
+ return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+ return *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
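These helpers read the two int16_t fields of an MV as one 32-bit word, which assumes sizeof(MV) == 4 with no padding. A hypothetical compile-time guard for that assumption (the typedef name is invented):

    /* Fails the build if MV ever grows padding or wider fields. */
    typedef char mv_is_32_bits_check[sizeof(MV) == sizeof(uint32_t) ? 1 : -1];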
static INLINE void clamp_mv(MV *mv, int min_col, int max_col,
int min_row, int max_row) {
mv->col = clamp(mv->col, min_col, max_col);
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index dff077c..637867a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -84,6 +84,10 @@
int subsampling_x;
int subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;  // Marks whether 16-bit frame buffers are needed.
+#endif
+
YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer frame_bufs[FRAME_BUFFERS];
@@ -137,6 +141,7 @@
int mi_idx;
int prev_mi_idx;
+ int mi_alloc_size;
MODE_INFO *mip_array[2];
MODE_INFO **mi_grid_base_array[2];
@@ -178,8 +183,8 @@
unsigned int current_video_frame;
BITSTREAM_PROFILE profile;
- // BITS_8 in versions 0 and 1, BITS_10 or BITS_12 in version 2
- BIT_DEPTH bit_depth;
+ // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3.
+ vpx_bit_depth_t bit_depth;
#if CONFIG_VP9_POSTPROC
struct postproc_state postproc_state;
@@ -207,7 +212,7 @@
return NULL;
if (cm->ref_frame_map[index] < 0)
return NULL;
- assert(cm->ref_frame_map[index] < REF_FRAMES);
+ assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
return &cm->frame_bufs[cm->ref_frame_map[index]].buf;
}
@@ -275,6 +280,11 @@
}
}
+static INLINE int calc_mi_size(int len) {
+ // len is in mi units.
+ return len + MI_BLOCK_SIZE;
+}
+
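As a worked example of the padding (assuming the usual 8-pixel mi unit and MI_BLOCK_SIZE of 8): a 1920-pixel-wide frame spans 240 mi units, so calc_mi_size(240) allocates 248, leaving a superblock-sized margin for accesses near the frame edge.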
static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
int mi_row, int bh,
int mi_col, int bw,
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index abda4e6..e4e6ce7 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -366,6 +366,9 @@
unsigned int width, unsigned int height, int pitch) {
unsigned int i, j;
+  // TODO(jbb): The SIMD code uses both clamps but the C code does not;
+  // normalize and fix.
+ (void) bothclamp;
for (i = 0; i < height; i++) {
uint8_t *pos = start + i * pitch;
char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 403e105..471929a 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -9,11 +9,9 @@
*/
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
-
-#include "./vp9_rtcd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -292,32 +290,32 @@
typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left);
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
- l[0] = vp9_##type##_predictor_4x4; \
- l[1] = vp9_##type##_predictor_8x8; \
- l[2] = vp9_##type##_predictor_16x16; \
- l[3] = vp9_##type##_predictor_32x32
+void vp9_init_intra_predictors(void) {
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_4X4] = vp9_##type##_predictor_4x4; \
+ p[TX_8X8] = vp9_##type##_predictor_8x8; \
+ p[TX_16X16] = vp9_##type##_predictor_16x16; \
+ p[TX_32X32] = vp9_##type##_predictor_32x32
- intra_pred_allsizes(pred[V_PRED], v);
- intra_pred_allsizes(pred[H_PRED], h);
- intra_pred_allsizes(pred[D207_PRED], d207);
- intra_pred_allsizes(pred[D45_PRED], d45);
- intra_pred_allsizes(pred[D63_PRED], d63);
- intra_pred_allsizes(pred[D117_PRED], d117);
- intra_pred_allsizes(pred[D135_PRED], d135);
- intra_pred_allsizes(pred[D153_PRED], d153);
- intra_pred_allsizes(pred[TM_PRED], tm);
+ INIT_ALL_SIZES(pred[V_PRED], v);
+ INIT_ALL_SIZES(pred[H_PRED], h);
+ INIT_ALL_SIZES(pred[D207_PRED], d207);
+ INIT_ALL_SIZES(pred[D45_PRED], d45);
+ INIT_ALL_SIZES(pred[D63_PRED], d63);
+ INIT_ALL_SIZES(pred[D117_PRED], d117);
+ INIT_ALL_SIZES(pred[D135_PRED], d135);
+ INIT_ALL_SIZES(pred[D153_PRED], d153);
+ INIT_ALL_SIZES(pred[TM_PRED], tm);
- intra_pred_allsizes(dc_pred[0][0], dc_128);
- intra_pred_allsizes(dc_pred[0][1], dc_top);
- intra_pred_allsizes(dc_pred[1][0], dc_left);
- intra_pred_allsizes(dc_pred[1][1], dc);
+ INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+ INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+ INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+ INIT_ALL_SIZES(dc_pred[1][1], dc);
-#undef intra_pred_allsizes
+#undef INIT_ALL_SIZES
}
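With the tables keyed by TX_SIZE, a prediction becomes one indexed call. A sketch of a call site, assuming vp9_init_intra_predictors() has already run and dst, above, left are valid pixel buffers (names assumed):

    /* Predict an 8x8 vertical-mode block into dst from the
     * reconstructed border pixels. */
    pred[V_PRED][TX_8X8](dst, stride, above, left);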
static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
@@ -343,8 +341,6 @@
// 129 G H .. S T T T T T
// ..
- once(init_intra_pred_fn_ptrs);
-
// Get current frame pointer, width and height.
if (plane == 0) {
frame_width = xd->cur_buf->y_width;
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index d09d2a1..845f3bc 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -18,6 +18,8 @@
extern "C" {
#endif
+void vp9_init_intra_predictors(void);
+
void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 708f41b..32bcf9a 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -6,6 +6,7 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_idct.h"
struct macroblockd;
@@ -45,6 +46,13 @@
$avx_x86_64 = $avx2_x86_64 = '';
}
+# optimizations which depend on multiple features
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+} else {
+ $avx2_ssse3 = '';
+}
+
#
# RECON
#
@@ -268,7 +276,7 @@
#
if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
-specialize qw/vp9_mbpost_proc_down mmx sse2/;
+specialize qw/vp9_mbpost_proc_down sse2/;
$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
@@ -276,23 +284,14 @@
$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
-specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+specialize qw/vp9_post_proc_down_and_across sse2/;
$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise mmx sse2/;
+specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
}
-add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_inner/;
-
-add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_mb_outer/;
-
-add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
-specialize qw/vp9_blend_b/;
-
#
# Sub Pixel Filters
#
@@ -305,15 +304,15 @@
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -331,68 +330,177 @@
#
# dct
#
-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_1_add/;
-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+ add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_16_add/;
-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_1_add/;
-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+ add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_64_add/;
-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+ add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_12_add/;
-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+ add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_1_add/;
-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+ add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_256_add/;
-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+ add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_10_add/;
-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+ add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1024_add/;
-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+ add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_34_add/;
-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+ add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1_add/;
-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht4x4_16_add/;
-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht8x8_64_add/;
-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp9_iht16x16_256_add/;
+
+ # dct and add
+
+ add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_1_add/;
+
+ add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_16_add/;
+} else {
+ add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+ $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+
+ add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+ $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+
+ add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+ $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+
+ add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+ $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+
+ add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+ $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+
+ add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+ $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+
+ add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
+ $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+
+ add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
+ $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+
+ add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+ add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+
+ add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+ $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+ $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+
+ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+ # dct and add
+
+ add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_1_add/;
+
+ add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_16_add/;
+}
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+#
+# dct
+#
+add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_1_add/;
+
+add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_16_add/;
+
+add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_1_add/;
+
+add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_64_add/;
+
+add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_10_add/;
+
+add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_1_add/;
+
+add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_256_add/;
+
+add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_10_add/;
+
+add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1024_add/;
+
+add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_34_add/;
+
+add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1_add/;
+
+add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht4x4_16_add/;
+
+add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht8x8_64_add/;
+
+add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+specialize qw/vp9_high_iht16x16_256_add/;
# dct and add
-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_1_add/;
+add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_1_add/;
-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_16_add/;
+add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_16_add/;
+}
#
# Encoder functions below this point.
@@ -420,19 +528,19 @@
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
@@ -444,7 +552,7 @@
specialize qw/vp9_variance4x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
+specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -545,16 +653,16 @@
specialize qw/vp9_sad32x32 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x16 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
+specialize qw/vp9_sad8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc";
+specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vp9_sad8x4/, "$sse2_x86inc";
@@ -563,7 +671,7 @@
specialize qw/vp9_sad4x8/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
+specialize qw/vp9_sad4x4/, "$sse_x86inc";
add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
@@ -693,38 +801,57 @@
specialize qw/vp9_sad4x4x4d sse/;
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/;
+specialize qw/vp9_mse8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/;
+specialize qw/vp9_mse16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/;
+specialize qw/vp9_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss mmx sse2/;
+specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
# ENCODEMB INVOKE
-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
-
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# The transform coefficients are held in 32-bit values, so the assembler
+# code for vp9_block_error can no longer be used.
+ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp9_block_error/;
-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp/;
-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp_32x32/;
-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b/;
+
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b_32x32/;
+} else {
+ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp9_block_error avx2/;
+
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b_32x32/;
+}
#
# Structured Similarity (SSIM)
@@ -738,44 +865,86 @@
}
# fdct functions
-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2/;
-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht4x4/;
-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2/;
+ add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht8x8/;
-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+ add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht16x16/;
-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4_1 sse2/;
+ add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fwht4x4/;
-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2/;
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1/;
-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2 neon/;
+ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4/;
-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1/;
-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16_1 sse2/;
+ add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8/;
-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2/;
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1/;
-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_1 sse2/;
+ add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16/;
-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32 sse2 avx2/;
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1/;
-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+ add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32/;
+
+ add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_rd/;
+} else {
+ add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht4x4 sse2/;
+
+ add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht8x8 sse2/;
+
+ add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht16x16 sse2/;
+
+ add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1 sse2/;
+
+ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4 sse2/;
+
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1 sse2 neon/;
+
+ add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1 sse2/;
+
+ add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16 sse2/;
+
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1 sse2/;
+
+ add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32 sse2 avx2/;
+
+ add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+}
#
# Motion search
@@ -797,6 +966,654 @@
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+ # variance
+ add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance4x4/;
+
+ add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_get8x8var/;
+
+ add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance4x4/;
+
+ add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_10_get8x8var/;
+
+ add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_10_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance4x4/;
+
+ add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_12_get8x8var/;
+
+ add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_12_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad64x64/;
+
+ add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x64/;
+
+ add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad64x32/;
+
+ add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x16/;
+
+ add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x32/;
+
+ add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x32/;
+
+ add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x16/;
+
+ add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x8/;
+
+ add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x16/;
+
+ add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x8/;
+
+ add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x4/;
+
+ add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad4x8/;
+
+ add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad4x4/;
+
+ add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad64x64_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x64_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad64x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x4_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad4x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad4x4_avg/;
+
+ add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x64x3/;
+
+ add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x32x3/;
+
+ add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x16x3/;
+
+ add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x8x3/;
+
+ add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x16x3/;
+
+ add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x8x3/;
+
+ add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x4x3/;
+
+ add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad64x64x8/;
+
+ add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad32x32x8/;
+
+ add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad16x16x8/;
+
+ add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad16x8x8/;
+
+ add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x16x8/;
+
+ add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x8x8/;
+
+ add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x4x8/;
+
+ add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad4x8x8/;
+
+ add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad4x4x8/;
+
+ add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x64x4d/;
+
+ add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x64x4d/;
+
+ add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x32x4d/;
+
+ add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x16x4d/;
+
+ add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x32x4d/;
+
+ add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x32x4d/;
+
+ add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x16x4d/;
+
+ add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x8x4d/;
+
+ add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x16x4d/;
+
+ add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x8x4d/;
+
+ # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+ add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x4x4d/;
+
+ add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x8x4d/;
+
+ add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x4x4d/;
+
+ add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse8x8/;
+
+ # ENCODEMB INVOKE
+
+ add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp9_high_block_error/;
+
+ add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vp9_high_subtract_block/;
+
+ add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_fp/;
+
+ add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_fp_32x32/;
+
+ add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_b/;
+
+ add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_b_32x32/;
+
+ #
+ # Structured Similarity (SSIM)
+ #
+ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/vp9_high_ssim_parms_8x8/;
+
+ add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";
+ specialize qw/vp9_high_ssim_parms_8x8_shift/;
+ }
+
+ # fdct functions
+ add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht4x4/;
+
+ add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht8x8/;
+
+ add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht16x16/;
+
+ add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fwht4x4/;
+
+ add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct4x4/;
+
+ add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct8x8_1/;
+
+ add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct8x8/;
+
+ add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct16x16_1/;
+
+ add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct16x16/;
+
+ add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32_1/;
+
+ add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32/;
+
+ add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32_rd/;
+
+ add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/vp9_high_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
+
}
# end encoder functions
1;
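
The encoder-side rtcd changes above follow from one type change: transform
coefficient buffers move from int16_t to tran_low_t so that 10- and 12-bit
input has headroom, and under CONFIG_VP9_HIGHBITDEPTH every forward transform
is specialized to the C implementation only, because the existing
SSE2/SSSE3/AVX2/NEON kernels still assume 16-bit coefficients. A minimal C
sketch of the type split, assuming the conditional typedefs used by the
libvpx headers of this period (the exact widths here are recalled, not taken
from this patch):

#include <stdint.h>

/* Coefficient types: wider storage when high bit depth is enabled. */
#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;   /* 10/12-bit residuals overflow int16_t */
typedef int64_t tran_high_t;  /* intermediate precision inside transforms */
#else
typedef int16_t tran_low_t;   /* 8-bit path keeps the old narrow type */
typedef int32_t tran_high_t;
#endif

Note also that the vp9_high_* prototypes keep uint8_t pixel pointers even
though high-bit-depth samples occupy 16 bits: the convention (an assumption
here, based on the surrounding libvpx code) is that callers pass a converted
handle and each implementation recasts it to uint16_t internally, so one
rtcd signature serves both paths.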
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 1b4904c..b6847b9 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -139,25 +139,25 @@
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
-#if HAVE_AVX2
+#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if (ARCH_X86_64)
+#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else // ARCH_X86
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif
+#endif // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
@@ -190,9 +190,9 @@
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, avx2);
-#endif
+#endif // HAVE_AVX2 && HAVE_SSSE3
#if HAVE_SSSE3
-#if (ARCH_X86_64)
+#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
@@ -204,14 +204,14 @@
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else // ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif
+#endif // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
@@ -270,7 +270,7 @@
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);
-#endif
+#endif // HAVE_SSSE3
#if HAVE_SSE2
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
@@ -336,4 +336,4 @@
// int w, int h);
FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_ , sse2);
-#endif
+#endif // HAVE_SSE2
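
The vp9_asm_stubs.c hunks are a build fix plus comment cleanup: the AVX2
convolve path reuses SSSE3 kernels for the 8- and 4-pixel-wide filters (see
the #define aliases above), so a build configured with AVX2 but without
SSSE3 previously aliased functions that were never compiled and failed at
link time. A reduced illustration of that failure mode, with hypothetical
names standing in for the filter functions:

/* Hypothetical reduction of the guard bug fixed above; it is expected to
 * fail at link time when HAVE_SSSE3 is 0 but HAVE_AVX2 is 1. */
void narrow_filter_ssse3(void);   /* defined only when HAVE_SSSE3 */
#if HAVE_AVX2                     /* old guard: taken even with SSSE3 off */
#define narrow_filter_avx2 narrow_filter_ssse3
#endif
/* Every call through narrow_filter_avx2 is then an unresolved reference
 * to narrow_filter_ssse3; guarding on HAVE_AVX2 && HAVE_SSSE3 compiles
 * the aliasing block out instead. */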
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index b60f8a0..df60987 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -3573,6 +3573,7 @@
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i zero = _mm_setzero_si128();
// idct constants for each stage
const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
@@ -3635,7 +3636,6 @@
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
- int zero_flag[2];
for (i = 0; i < 4; i++) {
i32 = (i << 5);
@@ -3710,13 +3710,7 @@
zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
- zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
- zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
- zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
- zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
-
- if (!zero_flag[0] && !zero_flag[1]) {
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
col[i32 + 0] = _mm_setzero_si128();
col[i32 + 1] = _mm_setzero_si128();
col[i32 + 2] = _mm_setzero_si128();
@@ -3795,7 +3789,6 @@
col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
}
for (i = 0; i < 4; i++) {
- const __m128i zero = _mm_setzero_si128();
// Second 1-D idct
j = i << 3;
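
The idct hunks replace a five-intrinsic scalar reduction (unpack, or, shift,
two lane extracts into zero_flag[]) with the standard SSE2 whole-register
zero test, and hoist the zero constant to function scope so the skip check
and the second 1-D pass share one copy. The idiom, as a self-contained
sketch:

#include <emmintrin.h>  /* SSE2 intrinsics */

/* Returns nonzero iff every bit of v is zero -- the test adopted above. */
static int is_all_zero_sse2(__m128i v) {
  const __m128i zero = _mm_setzero_si128();
  /* _mm_cmpeq_epi32 sets each 32-bit lane that equals zero to all-ones;
   * _mm_movemask_epi8 collects the top bit of all 16 bytes, so a fully
   * zero input yields exactly 0xFFFF. */
  return _mm_movemask_epi8(_mm_cmpeq_epi32(v, zero)) == 0xFFFF;
}

When the OR-accumulated coefficients in zero_idx[14] pass this test, the 32
output rows of the current column group are simply zeroed and the inverse
transform for that group is skipped.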
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
deleted file mode 100644
index 5b8deef..0000000
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ /dev/null
@@ -1,533 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-;void vp9_post_proc_down_and_across_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned char *dst_ptr,
-; int src_pixels_per_line,
-; int dst_pixels_per_line,
-; int rows,
-; int cols,
-; int flimit
-;)
-global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
-sym(vp9_post_proc_down_and_across_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- ; move the global rd onto the stack, since we don't have enough registers
- ; to do PIC addressing
- movq mm0, [GLOBAL(rd)]
- sub rsp, 8
- movq [rsp], mm0
-%define RD [rsp]
-%else
-%define RD [GLOBAL(rd)]
-%endif
-
- push rbx
- lea rbx, [GLOBAL(Blur)]
- movd mm2, dword ptr arg(6) ;flimit
- punpcklwd mm2, mm2
- punpckldq mm2, mm2
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;dst_ptr
-
- movsxd rcx, DWORD PTR arg(4) ;rows
- movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
-
- xor rdx, rdx ; clear out rdx for use as loop counter
-.nextcol:
-
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
- movq mm3, [rsi] ; mm4 = r0 p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48] ; mm6 = kernel 3 taps
- movq mm5, [rsi + rax] ; mm4 = r1 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r1 p0..p3
- pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = r0 p0..p3
- psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3
- psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3
- paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers
- movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3
- psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r0 p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- neg rax
- movq mm6, [rbx ] ; kernel 0 taps
- movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7
- punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3
- pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16] ; kernel 1 taps
- movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7
- punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3
- pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers.
- paddusw mm3, mm6 ; mm3 += mm5
-
- ; thresholding
- movq mm6, mm1 ; mm6 = r0 p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3
- psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
-
- movd [rdi], mm1 ;
- neg rax ; pitch is positive
-
-
- add rsi, 4
- add rdi, 4
- add rdx, 4
-
- cmp edx, dword ptr arg(5) ;cols
- jl .nextcol
- ; done with the all cols, start the across filtering in place
- sub rsi, rdx
- sub rdi, rdx
-
-
- push rax
- xor rdx, rdx
- mov rax, [rdi-4];
-
-.acrossnextcol:
- pxor mm7, mm7 ; mm7 = 00000000
- movq mm6, [rbx + 32 ] ;
- movq mm4, [rdi+rdx] ; mm4 = p0..p7
- movq mm3, mm4 ; mm3 = p0..p7
- punpcklbw mm3, mm0 ; mm3 = p0..p3
- movq mm1, mm3 ; mm1 = p0..p3
- pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers
-
- movq mm6, [rbx + 48]
- psrlq mm4, 8 ; mm4 = p1..p7
- movq mm5, mm4 ; mm5 = p1..p7
- punpcklbw mm5, mm0 ; mm5 = p1..p4
- pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm7, mm1 ; mm7 = p0..p3
- psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4
- psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3
- paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4)
- pcmpgtw mm7, mm2
-
- movq mm6, [rbx + 64 ]
- psrlq mm4, 8 ; mm4 = p2..p7
- movq mm5, mm4 ; mm5 = p2..p7
- punpcklbw mm5, mm0 ; mm5 = p2..p5
- pmullw mm6, mm5 ; mm6 *= p2..p5 * kernel 4 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p2..p5
- psubusw mm5, mm1 ; mm5 = p2..p5 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p2..p5)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
-
- movq mm6, [rbx ]
- movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5
- movq mm5, mm4 ; mm5 = p-2..p5
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm6, mm5 ; mm6 *= p-2..p1 * kernel 0 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm5 ; mm6 = p0..p3 - p-2..p1
- psubusw mm5, mm1 ; mm5 = p-2..p1 - p0..p3
- paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p-2..p1)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- movq mm6, [rbx + 16]
- psrlq mm4, 8 ; mm4 = p-1..p5
- punpcklbw mm4, mm0 ; mm4 = p-1..p2
- pmullw mm6, mm4 ; mm6 *= p-1..p2 * kernel 1 modifiers
- paddusw mm3, mm6 ; mm3 += mm6
-
- ; thresholding
- movq mm6, mm1 ; mm6 = p0..p3
- psubusw mm6, mm4 ; mm6 = p0..p3 - p-1..p2
- psubusw mm4, mm1 ; mm4 = p-1..p2 - p0..p3
- paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p-1..p2)
- pcmpgtw mm6, mm2
- por mm7, mm6 ; accumulate thresholds
-
- paddusw mm3, RD ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
-
- pand mm1, mm7 ; mm1 select vals > thresh from source
- pandn mm7, mm3 ; mm7 select vals < thresh from blurred result
- paddusw mm1, mm7 ; combination
-
- packuswb mm1, mm0 ; pack to bytes
- mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes
- movd eax, mm1
-
- add rdx, 4
- cmp edx, dword ptr arg(5) ;cols
- jl .acrossnextcol;
-
- mov DWORD PTR [rdi+rdx-4], eax
- pop rax
-
- ; done with this row
- add rsi,rax ; next line
- movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch?
- add rdi,rax ; next destination
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; source pitch?
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef RD
-
-
-;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
-extern sym(vp9_rv)
-global sym(vp9_mbpost_proc_down_mmx) PRIVATE
-sym(vp9_mbpost_proc_down_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 136
-
- ; unsigned char d[16][8] at [rsp]
- ; create flimit2 at [rsp+128]
- mov eax, dword ptr arg(4) ;flimit
- mov [rsp+128], eax
- mov [rsp+128+4], eax
-%define flimit2 [rsp+128]
-
-%if ABI_IS_32BIT=0
- lea r8, [GLOBAL(sym(vp9_rv))]
-%endif
-
- ;rows +=8;
- add dword ptr arg(2), 8
-
- ;for(c=0; c<cols; c+=4)
-.loop_col:
- mov rsi, arg(0) ;s
- pxor mm0, mm0 ;
-
- movsxd rax, dword ptr arg(1) ;pitch ;
- neg rax ; rax = -pitch
-
- lea rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
- neg rax
-
-
- pxor mm5, mm5
- pxor mm6, mm6 ;
-
- pxor mm7, mm7 ;
- mov rdi, rsi
-
- mov rcx, 15 ;
-
-.loop_initvar:
- movd mm1, DWORD PTR [rdi];
- punpcklbw mm1, mm0 ;
-
- paddw mm5, mm1 ;
- pmullw mm1, mm1 ;
-
- movq mm2, mm1 ;
- punpcklwd mm1, mm0 ;
-
- punpckhwd mm2, mm0 ;
- paddd mm6, mm1 ;
-
- paddd mm7, mm2 ;
- lea rdi, [rdi+rax] ;
-
- dec rcx
- jne .loop_initvar
- ;save the var and sum
- xor rdx, rdx
-.loop_row:
- movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
- movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
-
- punpcklbw mm1, mm0
- punpcklbw mm2, mm0
-
- paddw mm5, mm2
- psubw mm5, mm1
-
- pmullw mm2, mm2
- movq mm4, mm2
-
- punpcklwd mm2, mm0
- punpckhwd mm4, mm0
-
- paddd mm6, mm2
- paddd mm7, mm4
-
- pmullw mm1, mm1
- movq mm2, mm1
-
- punpcklwd mm1, mm0
- psubd mm6, mm1
-
- punpckhwd mm2, mm0
- psubd mm7, mm2
-
-
- movq mm3, mm6
- pslld mm3, 4
-
- psubd mm3, mm6
- movq mm1, mm5
-
- movq mm4, mm5
- pmullw mm1, mm1
-
- pmulhw mm4, mm4
- movq mm2, mm1
-
- punpcklwd mm1, mm4
- punpckhwd mm2, mm4
-
- movq mm4, mm7
- pslld mm4, 4
-
- psubd mm4, mm7
-
- psubd mm3, mm1
- psubd mm4, mm2
-
- psubd mm3, flimit2
- psubd mm4, flimit2
-
- psrad mm3, 31
- psrad mm4, 31
-
- packssdw mm3, mm4
- packsswb mm3, mm0
-
- movd mm1, DWORD PTR [rsi+rax*8]
-
- movq mm2, mm1
- punpcklbw mm1, mm0
-
- paddw mm1, mm5
- mov rcx, rdx
-
- and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
- push rax
- lea rax, [GLOBAL(sym(vp9_rv))]
- movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2]
- pop rax
-%elif ABI_IS_32BIT=0
- movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2]
-%else
- movq mm4, [sym(vp9_rv) + rcx*2]
-%endif
- paddw mm1, mm4
- ;paddw xmm1, eight8s
- psraw mm1, 4
-
- packuswb mm1, mm0
- pand mm1, mm3
-
- pandn mm3, mm2
- por mm1, mm3
-
- and rcx, 15
- movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
-
- mov rcx, rdx
- sub rcx, 8
-
- and rcx, 15
- movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
-
- movd [rsi], mm1
- lea rsi, [rsi+rax]
-
- lea rdi, [rdi+rax]
- add rdx, 1
-
- cmp edx, dword arg(2) ;rows
- jl .loop_row
-
-
- add dword arg(0), 4 ; s += 4
- sub dword arg(3), 4 ; cols -= 4
- cmp dword arg(3), 0
- jg .loop_col
-
- add rsp, 136
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-%undef flimit2
-
-
-;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
-; unsigned char blackclamp[16],
-; unsigned char whiteclamp[16],
-; unsigned char bothclamp[16],
-; unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_mmx) PRIVATE
-sym(vp9_plane_add_noise_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
-.addnoise_loop:
- call sym(LIBVPX_RAND) WRT_PLT
- mov rcx, arg(1) ;noise
- and rax, 0xff
- add rcx, rax
-
- ; we rely on the fact that the clamping vectors are stored contiguously
- ; in black/white/both order. Note that we have to reload this here because
- ; rdx could be trashed by rand()
- mov rdx, arg(2) ; blackclamp
-
-
- mov rdi, rcx
- movsxd rcx, dword arg(5) ;[Width]
- mov rsi, arg(0) ;Pos
- xor rax,rax
-
-.addnoise_nextset:
- movq mm1,[rsi+rax] ; get the source
-
- psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't go out of range when adding noise
- paddusb mm1, [rdx+32] ;bothclamp
- psubusb mm1, [rdx+16] ;whiteclamp
-
- movq mm2,[rdi+rax] ; get the noise for this line
- paddb mm1,mm2 ; add it in
- movq [rsi+rax],mm1 ; store the result
-
- add rax,8 ; move to the next line
-
- cmp rax, rcx
- jl .addnoise_nextset
-
- movsxd rax, dword arg(7) ; Pitch
- add arg(0), rax ; Start += Pitch
- sub dword arg(6), 1 ; Height -= 1
- jg .addnoise_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-Blur:
- times 16 dw 16
- times 8 dw 64
- times 16 dw 16
- times 8 dw 0
-
-rd:
- times 4 dw 0x40
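
For reference while reading the removed routine above, this is a minimal scalar sketch of the 5-tap thresholded blur it implements (pitch is the row stride for the down pass and 1 for the across pass; the tap values and shift follow the Blur/rd tables and the VP9_FILTER_SHIFT comments above, so treat it as a reading aid rather than the canonical C code):

    #include <stdlib.h>

    /* Sketch: blur a pixel with taps {16, 16, 64, 16, 16} (sum 128), but
     * fall back to the source pixel if any neighbor differs from the center
     * by more than flimit. The 64 added to sum is the rd rounding constant;
     * >> 7 is VP9_FILTER_SHIFT. */
    static unsigned char postproc_pixel(const unsigned char *p, int pitch,
                                        int flimit) {
      static const int taps[5] = { 16, 16, 64, 16, 16 };
      int k, sum = 64, over_thresh = 0;
      for (k = -2; k <= 2; ++k) {
        sum += taps[k + 2] * p[k * pitch];
        if (abs(p[k * pitch] - p[0]) > flimit)  /* threshold any neighbor */
          over_thresh = 1;
      }
      return over_thresh ? p[0] : (unsigned char)(sum >> 7);
    }
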
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index d109e13..3bc7d39 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -307,7 +307,7 @@
__m256i addFilterReg64;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, srcReg32b13, filtersReg32;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
unsigned int i;
unsigned int src_stride, dst_stride;
@@ -409,35 +409,35 @@
// multiply 2 adjacent elements with the filter and add the result
srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
// add and saturate the results together
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b8);
-
// multiply 2 adjacent elements with the filter and add the result
srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-
- // multiply 2 adjacent elements with the filter and add the result
srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
- srcReg32b13 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
// add and saturate the results together
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
_mm256_min_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_min_epi16(srcReg32b6, srcReg32b13));
-
- // add and saturate the results together
srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
_mm256_max_epi16(srcReg32b8, srcReg32b12));
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_max_epi16(srcReg32b6, srcReg32b13));
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_min_epi16(srcReg32b8, srcReg32b12));
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_max_epi16(srcReg32b8, srcReg32b12));
srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 0797168..7615cdd 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -195,7 +195,7 @@
struct macroblockd_plane *const pd = &xd->plane[plane];
if (eob > 0) {
TX_TYPE tx_type = DCT_DCT;
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
if (xd->lossless) {
tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -330,6 +330,9 @@
if (!vp9_is_valid_scale(&ref_buffer->sf))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
+ if (ref_buffer->buf->corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Block reference is corrupt");
vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
&ref_buffer->sf);
xd->corrupted |= ref_buffer->buf->corrupted;
@@ -627,16 +630,17 @@
"Width and height beyond allowed size.");
#endif
if (cm->width != width || cm->height != height) {
- const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
- const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+ const int new_mi_rows =
+ ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+ const int new_mi_cols =
+ ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
- // Change in frame size (assumption: color format does not change).
- if (cm->width == 0 || cm->height == 0 ||
- aligned_width > cm->width ||
- aligned_width * aligned_height > cm->width * cm->height) {
+ // Allocations in vp9_alloc_context_buffers() depend on individual
+ // dimensions as well as the overall size.
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
if (vp9_alloc_context_buffers(cm, width, height))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffers");
+ "Failed to allocate context buffers");
} else {
vp9_set_mb_mi(cm, width, height);
}
@@ -654,12 +658,25 @@
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
+ cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+}
+
+static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
+ int ref_xss, int ref_yss,
+ vpx_bit_depth_t this_bit_depth,
+ int this_xss, int this_yss) {
+ return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+ ref_yss == this_yss;
}
static void setup_frame_size_with_refs(VP9_COMMON *cm,
@@ -672,6 +689,10 @@
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
height = buf->y_crop_height;
+ if (buf->corrupted) {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Frame reference is corrupt");
+ }
found = 1;
break;
}
@@ -695,18 +716,35 @@
if (!has_valid_ref_frame)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ if (!valid_ref_frame_img_fmt(
+ ref_frame->buf->bit_depth,
+ ref_frame->buf->uv_crop_width < ref_frame->buf->y_crop_width,
+ ref_frame->buf->uv_crop_height < ref_frame->buf->y_crop_height,
+ cm->bit_depth,
+ cm->subsampling_x,
+ cm->subsampling_y))
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Referenced frame has incompatible color space");
+ }
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y, VP9_DEC_BORDER_IN_PIXELS,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_DEC_BORDER_IN_PIXELS,
&cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
cm->cb_priv)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
+ cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -814,6 +852,8 @@
if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+ // Be sure to sync as we might be resuming after a failed frame decode.
+ winterface->sync(&pbi->lf_worker);
lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
vp9_copy(lf_data->planes, pbi->mb.plane);
@@ -883,7 +923,7 @@
pbi->mb.corrupted |= tile_data->xd.corrupted;
}
// Loopfilter one row.
- if (cm->lf.filter_level) {
+ if (cm->lf.filter_level && !pbi->mb.corrupted) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -906,7 +946,7 @@
}
// Loopfilter remaining rows in the frame.
- if (cm->lf.filter_level) {
+ if (cm->lf.filter_level && !pbi->mb.corrupted) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
@@ -920,9 +960,8 @@
return vp9_reader_find_end(&tile_data->bit_reader);
}
-static int tile_worker_hook(void *arg1, void *arg2) {
- TileWorkerData *const tile_data = (TileWorkerData*)arg1;
- const TileInfo *const tile = (TileInfo*)arg2;
+static int tile_worker_hook(TileWorkerData *const tile_data,
+ const TileInfo *const tile) {
int mi_row, mi_col;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
@@ -995,6 +1034,7 @@
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
+ winterface->sync(&pbi->tile_workers[n]);
pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
}
@@ -1098,7 +1138,7 @@
static void read_bitdepth_colorspace_sampling(
VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
if (cm->profile >= PROFILE_2)
- cm->bit_depth = vp9_rb_read_bit(rb) ? BITS_12 : BITS_10;
+ cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10;
cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
@@ -1142,6 +1182,7 @@
"Invalid frame marker");
cm->profile = vp9_read_profile(rb);
+
if (cm->profile >= MAX_PROFILES)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Unsupported bitstream profile");
@@ -1181,6 +1222,7 @@
}
setup_frame_size(cm, rb);
+ pbi->need_resync = 0;
} else {
cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
@@ -1204,6 +1246,7 @@
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb);
+ pbi->need_resync = 0;
} else {
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1232,6 +1275,12 @@
}
}
+ if (pbi->need_resync) {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
if (!cm->error_resilient_mode) {
cm->refresh_frame_context = vp9_rb_read_bit(rb);
cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
@@ -1400,7 +1449,7 @@
if (!first_partition_size) {
// showing a frame directly
- *p_data_end = data + 1;
+ *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
return;
}
@@ -1431,9 +1480,11 @@
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
cm->frame_parallel_decoding_mode) {
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
- // If multiple threads are used to decode tiles, then we use those threads
- // to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ if (!xd->corrupted) {
+ // If multiple threads are used to decode tiles, then we use those threads
+ // to do parallel loopfiltering.
+ vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ }
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
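
The resize check above now reasons in mi units rather than pixels. A small sketch of that conversion, assuming MI_SIZE_LOG2 is 3 (8-pixel mi units) as in VP9; the helper name is illustrative:

    #define MI_SIZE_LOG2 3
    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

    /* Pixels -> mi units, rounding up to a whole 8-pixel mi block. */
    static int pixels_to_mi(int pixels) {
      return ALIGN_POWER_OF_TWO(pixels, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
    }
    /* e.g. pixels_to_mi(1920) == 240 and pixels_to_mi(1921) == 241, which
     * is why new_mi_cols and new_mi_rows are compared individually above. */
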
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 2a2f0f5..6ee3d70 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -25,6 +25,7 @@
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -36,7 +37,9 @@
static int init_done = 0;
if (!init_done) {
+ vp9_rtcd();
vp9_init_neighbors();
+ vp9_init_intra_predictors();
init_done = 1;
}
}
@@ -57,15 +60,15 @@
}
cm->error.setjmp = 1;
+ pbi->need_resync = 1;
initialize_dec();
- vp9_rtcd();
-
// Initialize the references to not point to any frame buffers.
vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
+ cm->bit_depth = VPX_BITS_8;
// vp9_init_dequantizer() is first called here. Add check in
// frame_init_dequantizer() to avoid unnecessary calling of
@@ -96,10 +99,8 @@
}
vpx_free(pbi->tile_workers);
- if (pbi->num_tile_workers) {
- const int sb_rows =
- mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
- vp9_loop_filter_dealloc(&pbi->lf_row_sync, sb_rows);
+ if (pbi->num_tile_workers > 0) {
+ vp9_loop_filter_dealloc(&pbi->lf_row_sync);
}
vp9_remove_common(cm);
@@ -123,8 +124,12 @@
* later commit that adds VP9-specific controls for this functionality.
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
- const YV12_BUFFER_CONFIG *const cfg =
- &cm->frame_bufs[cm->ref_frame_map[0]].buf;
+ const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0);
+ if (cfg == NULL) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "No 'last' reference frame");
+ return VPX_CODEC_ERROR;
+ }
if (!equal_dimensions(cfg, sd))
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Incorrect buffer dimensions");
@@ -234,6 +239,7 @@
cm->new_fb_idx = get_free_fb(cm);
if (setjmp(cm->error.jmp)) {
+ pbi->need_resync = 1;
cm->error.setjmp = 0;
vp9_clear_system_state();
@@ -310,3 +316,67 @@
vp9_clear_system_state();
return ret;
}
+
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+ size_t data_sz,
+ uint32_t sizes[8], int *count,
+ vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
+ // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+ // it is a superframe index. If the last byte of real video compression
+ // data is 0xc0, the encoder must add a 0 byte. If we have the marker but
+ // not the associated matching marker byte at the front of the index, we
+ // have an invalid bitstream and need to return an error.
+
+ uint8_t marker;
+
+ assert(data_sz);
+ marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
+ *count = 0;
+
+ if ((marker & 0xe0) == 0xc0) {
+ const uint32_t frames = (marker & 0x7) + 1;
+ const uint32_t mag = ((marker >> 3) & 0x3) + 1;
+ const size_t index_sz = 2 + mag * frames;
+
+ // This chunk is marked as having a superframe index but doesn't have
+ // enough data for it, thus it's an invalid superframe index.
+ if (data_sz < index_sz)
+ return VPX_CODEC_CORRUPT_FRAME;
+
+ {
+ const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
+ data + data_sz - index_sz);
+
+ // This chunk is marked as having a superframe index but doesn't have
+ // the matching marker byte at the front of the index; therefore it's an
+ // invalid chunk.
+ if (marker != marker2)
+ return VPX_CODEC_CORRUPT_FRAME;
+ }
+
+ {
+ // Found a valid superframe index.
+ uint32_t i, j;
+ const uint8_t *x = &data[data_sz - index_sz + 1];
+
+ // Frames has a maximum of 8 and mag has a maximum of 4.
+ uint8_t clear_buffer[32];
+ assert(sizeof(clear_buffer) >= frames * mag);
+ if (decrypt_cb) {
+ decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
+ x = clear_buffer;
+ }
+
+ for (i = 0; i < frames; ++i) {
+ uint32_t this_sz = 0;
+
+ for (j = 0; j < mag; ++j)
+ this_sz |= (*x++) << (j * 8);
+ sizes[i] = this_sz;
+ }
+ *count = frames;
+ }
+ }
+ return VPX_CODEC_OK;
+}
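
As a worked illustration of the marker byte layout the parser above validates (top three bits 110, two bits of magnitude-minus-one, three bits of frame-count-minus-one), here is a hedged sketch; the helper name is not part of the library:

    #include <stddef.h>
    #include <stdint.h>

    /* Returns the superframe index size in bytes, or 0 if the trailing byte
     * is not a superframe marker. Mirrors the checks in the parser above. */
    static size_t superframe_index_size(uint8_t marker) {
      if ((marker & 0xe0) != 0xc0) return 0;        /* top bits must be 110 */
      {
        const uint32_t frames = (marker & 0x7) + 1;      /* 1..8 frames */
        const uint32_t mag = ((marker >> 3) & 0x3) + 1;  /* 1..4 size bytes */
        return 2 + (size_t)mag * frames;  /* two marker bytes plus sizes */
      }
    }
    /* e.g. marker 0xc9: frames = 2, mag = 2, index size = 2 + 2 * 2 = 6. */
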
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 223b66f..4f52bb9 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -58,6 +58,7 @@
int max_threads;
int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -78,6 +79,25 @@
void vp9_decoder_remove(struct VP9Decoder *pbi);
+static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state,
+ const uint8_t *data) {
+ if (decrypt_cb) {
+ uint8_t marker;
+ decrypt_cb(decrypt_state, data, &marker, 1);
+ return marker;
+ }
+ return *data;
+}
+
+// This function is exposed for use in tests, as is the inlined function
+// read_marker() above.
+vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
+ size_t data_sz,
+ uint32_t sizes[8], int *count,
+ vpx_decrypt_cb decrypt_cb,
+ void *decrypt_state);
+
#ifdef __cplusplus
} // extern "C"
#endif
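
read_marker() above funnels a single byte through the optional decrypt callback. A hedged usage sketch with a made-up XOR "decryptor" (the callback signature matches vpx_decrypt_cb; everything else here is illustrative):

    #include <stdint.h>

    /* Hypothetical decrypt callback: XOR each byte with a fixed key. */
    static void xor_decrypt(void *decrypt_state, const unsigned char *input,
                            unsigned char *output, int count) {
      const unsigned char key = *(const unsigned char *)decrypt_state;
      int i;
      for (i = 0; i < count; ++i)
        output[i] = input[i] ^ key;
    }

    /* Usage (data/data_sz as in vp9_parse_superframe_index):
     *   unsigned char key = 0x5a;
     *   uint8_t marker = read_marker(xor_decrypt, &key,
     *                                data + data_sz - 1); */
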
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 91cdf38..76ca1ae 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -51,7 +51,7 @@
} while (0)
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
- int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+ tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
int ctx, const int16_t *scan, const int16_t *nb,
vp9_reader *r) {
const int max_eob = 16 << (tx_size << 1);
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index 5dda49a..6635880 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -121,10 +121,10 @@
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
- TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+static int loop_filter_row_worker(TileWorkerData *const tile_data,
+ void *unused) {
LFWorkerData *const lf_data = &tile_data->lfdata;
- (void) arg2;
+ (void)unused;
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
@@ -145,24 +145,13 @@
const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
int i;
- // Allocate memory used in thread synchronization.
- // This always needs to be done even if frame_filter_level is 0.
- if (!cm->current_video_frame || cm->last_height != cm->height) {
- if (cm->last_height != cm->height) {
- const int aligned_last_height =
- ALIGN_POWER_OF_TWO(cm->last_height, MI_SIZE_LOG2);
- const int last_sb_rows =
- mi_cols_aligned_to_sb(aligned_last_height >> MI_SIZE_LOG2) >>
- MI_BLOCK_SIZE_LOG2;
-
- vp9_loop_filter_dealloc(lf_sync, last_sb_rows);
- }
-
- vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
- }
-
if (!frame_filter_level) return;
+ if (!lf_sync->sync_range || cm->last_height != cm->height) {
+ vp9_loop_filter_dealloc(lf_sync);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
+ }
+
vp9_loop_filter_frame_init(cm, frame_filter_level);
// Initialize cur_sb_col to -1 for all SB rows.
@@ -225,21 +214,24 @@
}
// Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
int width) {
+ lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
- int i;
+ {
+ int i;
- CHECK_MEM_ERROR(cm, lf_sync->mutex_,
- vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
- for (i = 0; i < rows; ++i) {
- pthread_mutex_init(&lf_sync->mutex_[i], NULL);
- }
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+ vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+ }
- CHECK_MEM_ERROR(cm, lf_sync->cond_,
- vpx_malloc(sizeof(*lf_sync->cond_) * rows));
- for (i = 0; i < rows; ++i) {
- pthread_cond_init(&lf_sync->cond_[i], NULL);
+ CHECK_MEM_ERROR(cm, lf_sync->cond_,
+ vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[i], NULL);
+ }
}
#endif // CONFIG_MULTITHREAD
@@ -251,23 +243,19 @@
}
// Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows) {
-#if !CONFIG_MULTITHREAD
- (void)rows;
-#endif // !CONFIG_MULTITHREAD
-
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
if (lf_sync != NULL) {
#if CONFIG_MULTITHREAD
int i;
if (lf_sync->mutex_ != NULL) {
- for (i = 0; i < rows; ++i) {
+ for (i = 0; i < lf_sync->rows; ++i) {
pthread_mutex_destroy(&lf_sync->mutex_[i]);
}
vpx_free(lf_sync->mutex_);
}
if (lf_sync->cond_ != NULL) {
- for (i = 0; i < rows; ++i) {
+ for (i = 0; i < lf_sync->rows; ++i) {
pthread_cond_destroy(&lf_sync->cond_[i]);
}
vpx_free(lf_sync->cond_);
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index 423bd88..b1fbdeb 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -38,14 +38,15 @@
// The optimal sync_range for different resolution and platform should be
// determined by testing. Currently, it is chosen to be a power-of-2 number.
int sync_range;
+ int rows;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
- int rows, int width);
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+ int width);
// Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync, int rows);
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
// Multi-threaded loopfilter that uses the tile threads.
void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
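
The reworked alloc/dealloc pair above can drop the rows argument from deallocation because the row count now lives in VP9LfSync itself. A generic sketch of that pattern, with hypothetical names:

    #include <stdlib.h>

    typedef struct {
      int *per_row;
      int rows;  /* remembered at alloc time; teardown needs no argument */
    } row_sync_t;

    static int row_sync_alloc(row_sync_t *s, int rows) {
      s->per_row = (int *)malloc(sizeof(*s->per_row) * rows);
      s->rows = s->per_row ? rows : 0;
      return s->per_row != NULL;
    }

    static void row_sync_free(row_sync_t *s) {
      /* s->rows would bound any per-row destruction (mutexes, conds). */
      free(s->per_row);
      s->per_row = NULL;
      s->rows = 0;
    }
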
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 2d5ec79..8c13d0d 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -28,7 +28,6 @@
int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
@@ -39,7 +38,7 @@
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
-
+ int i;
const int16x8_t v_zero = vdupq_n_s16(0);
const int16x8_t v_one = vdupq_n_s16(1);
int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
@@ -50,13 +49,12 @@
v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
-
- for (i = 0; i < count; i += 8) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
- const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
- const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
vget_low_s16(v_quant));
const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
@@ -65,20 +63,39 @@
vshrn_n_s32(v_tmp_hi, 16));
const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan =
- vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
-
v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
-
- vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
- vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
v_round = vmovq_n_s16(round_ptr[1]);
v_quant = vmovq_n_s16(quant_ptr[1]);
v_dequant = vmovq_n_s16(dequant_ptr[1]);
}
+ // now process the rest of the ac coeffs
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+ vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+ vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+ vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ }
{
const int16x4_t v_eobmax_3210 =
vmax_s16(vget_low_s16(v_eobmax_76543210),
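
Per lane, the NEON loops above compute round + |coeff|, a Q16 multiply by quant, sign restoration, and a dequant multiply. A scalar model of one coefficient, under the assumption that the >> 16 matches the vshrn_n_s32(…, 16) narrowing:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one lane of the NEON fast-quantize loop. Index 0 uses
     * the DC round/quant/dequant entries; later indices use the AC entries. */
    static void quantize_one(int16_t coeff, int16_t round, int16_t quant,
                             int16_t dequant, int16_t *qcoeff,
                             int16_t *dqcoeff) {
      const int tmp = abs(coeff) + round;                /* vabaq_s16 */
      const int16_t q = (int16_t)((tmp * quant) >> 16);  /* vmull + vshrn */
      *qcoeff = (coeff < 0) ? (int16_t)-q : q;           /* sign restore */
      *dqcoeff = (int16_t)(*qcoeff * dequant);           /* vmulq_s16 */
    }
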
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b0ff0fa..b605248 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -294,6 +294,7 @@
vp9_write_token(w, vp9_switchable_interp_tree,
cm->fc.switchable_interp_prob[ctx],
&switchable_interp_encodings[mbmi->interp_filter]);
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter];
} else {
assert(mbmi->interp_filter == cm->interp_filter);
}
@@ -421,11 +422,13 @@
const int bs = (1 << bsl) / 4;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- const MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+ const MODE_INFO *m = NULL;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
+ m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
@@ -668,8 +671,6 @@
vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
- vp9_clear_system_state();
-
for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
frame_coef_probs[tx_size]);
@@ -996,8 +997,10 @@
// Set "found" to 0 for temporal svc and for spatial svc key frame
if (cpi->use_svc &&
- (cpi->svc.number_spatial_layers == 1 ||
- cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame)) {
+ ((cpi->svc.number_temporal_layers > 1 &&
+ cpi->oxcf.rc_mode == VPX_CBR) ||
+ (cpi->svc.number_spatial_layers > 1 &&
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
found = 0;
}
vp9_wb_write_bit(wb, found);
@@ -1043,8 +1046,8 @@
static void write_bitdepth_colorspace_sampling(
VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) {
if (cm->profile >= PROFILE_2) {
- assert(cm->bit_depth > BITS_8);
- vp9_wb_write_bit(wb, cm->bit_depth - BITS_10);
+ assert(cm->bit_depth > VPX_BITS_8);
+ vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
}
vp9_wb_write_literal(wb, cm->color_space, 3);
if (cm->color_space != SRGB) {
@@ -1081,7 +1084,16 @@
write_bitdepth_colorspace_sampling(cm, wb);
write_frame_size(cm, wb);
} else {
- if (!cm->show_frame)
+ // In spatial svc, if it's not error_resilient_mode, then we need to code
+ // all visible frames as invisible. But we need to keep the show_frame flag
+ // so that the publisher knows whether the frame is supposed to be visible.
+ // So we code the show_frame flag as-is, then code the intra_only bit here.
+ // This will make the bitstream incompatible. In the player we will change
+ // the show_frame flag to 0, then add a one-byte frame with the
+ // show_existing_frame flag, which tells the decoder which frame we want
+ // to show.
+ if (!cm->show_frame ||
+ (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0))
vp9_wb_write_bit(wb, cm->intra_only);
if (!cm->error_resilient_mode)
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 8e82d1c..b488261 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -26,7 +26,7 @@
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref &&
(!cpi->use_svc || // Add spatial svc base layer case here
- (is_spatial_svc(cpi) &&
+ (is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].gold_ref_idx >=0 &&
cpi->oxcf.ss_play_alternate[0]));
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index bd3b0fd..767bd7f 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -28,8 +28,8 @@
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
- int16_t *qcoeff;
- int16_t *coeff;
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
uint16_t *eobs;
struct buf_2d src;
@@ -76,16 +76,12 @@
int pred_mv_sad[MAX_REF_FRAMES];
int nmvjointcost[MV_JOINTS];
- int nmvcosts[2][MV_VALS];
int *nmvcost[2];
- int nmvcosts_hp[2][MV_VALS];
int *nmvcost_hp[2];
int **mvcost;
int nmvjointsadcost[MV_JOINTS];
- int nmvsadcosts[2][MV_VALS];
int *nmvsadcost[2];
- int nmvsadcosts_hp[2][MV_VALS];
int *nmvsadcost_hp[2];
int **mvsadcost;
@@ -116,15 +112,19 @@
int quant_fp;
// skip forward transform and quantization
- int skip_txfm[MAX_MB_PLANE];
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
- int64_t bsse[MAX_MB_PLANE];
+ int64_t bsse[MAX_MB_PLANE << 2];
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
- void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
- void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+ void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+ void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+#endif
};
#ifdef __cplusplus
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index 9b7a932..12acc51 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
- vpx_memalign(16, num_pix * sizeof(uint16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
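
Switching these allocations to sizeof(*ptr) lets the byte counts follow the pointee type automatically, which matters exactly when a typedef such as tran_low_t changes width. A minimal illustration of the idiom (the tran_low_t width shown is an assumption tied to the high-bit-depth build):

    #include <stdint.h>
    #include <stdlib.h>

    typedef int32_t tran_low_t;  /* assumed widened type for high bit depth */

    int main(void) {
      /* sizeof(*buf) tracks buf's element type; no edit if it changes. */
      tran_low_t *buf = malloc(64 * 64 * sizeof(*buf));
      free(buf);
      return 0;
    }
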
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index d60e6c3..97f0741 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -19,21 +19,24 @@
typedef struct {
MODE_INFO mic;
uint8_t *zcoeff_blk;
- int16_t *coeff[MAX_MB_PLANE][3];
- int16_t *qcoeff[MAX_MB_PLANE][3];
- int16_t *dqcoeff[MAX_MB_PLANE][3];
+ tran_low_t *coeff[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff[MAX_MB_PLANE][3];
uint16_t *eobs[MAX_MB_PLANE][3];
// dual buffer pointers, 0: in use, 1: best in store
- int16_t *coeff_pbuf[MAX_MB_PLANE][3];
- int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
- int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
int is_coded;
int num_4x4_blk;
int skip;
- int skip_txfm[MAX_MB_PLANE];
+ // For the current partition, skippable is set to 1 only if all Y, U, and
+ // V transform blocks' coefficients are quantized to 0.
+ int skippable;
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
int best_mode_index;
int hybrid_pred_diff;
int comp_pred_diff;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 59222f0..eff8996 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -18,15 +18,17 @@
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
-static INLINE int fdct_round_shift(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(INT16_MIN <= rv && rv <= INT16_MAX);
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert
+ // and make the bounds consts.
+ // assert(INT16_MIN <= rv && rv <= INT16_MAX);
return rv;
}
-static void fdct4(const int16_t *input, int16_t *output) {
- int16_t step[4];
- int temp1, temp2;
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t step[4];
+ tran_high_t temp1, temp2;
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
@@ -43,9 +45,9 @@
output[3] = fdct_round_shift(temp2);
}
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c)
sum += input[r * stride + c];
@@ -54,7 +56,7 @@
output[1] = 0;
}
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -63,22 +65,23 @@
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[4 * 4];
- const int16_t *in = input;
- int16_t *out = intermediate;
+ tran_low_t intermediate[4 * 4];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- /*canbe16*/ int input[4];
- /*canbe16*/ int step[4];
- /*needs32*/ int temp1, temp2;
+ tran_high_t input[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
- input[0] = in[0 * stride] * 16;
- input[1] = in[1 * stride] * 16;
- input[2] = in[2 * stride] * 16;
- input[3] = in[3 * stride] * 16;
+ input[0] = in_pass0[0 * stride] * 16;
+ input[1] = in_pass0[1 * stride] * 16;
+ input[2] = in_pass0[2 * stride] * 16;
+ input[3] = in_pass0[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
@@ -102,6 +105,7 @@
out[1] = fdct_round_shift(temp1);
out[3] = fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
+ in_pass0++;
in++;
out += 4;
}
@@ -119,9 +123,9 @@
}
}
-static void fadst4(const int16_t *input, int16_t *output) {
- int x0, x1, x2, x3;
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
x0 = input[0];
x1 = input[1];
@@ -166,15 +170,15 @@
{ fadst4, fadst4 } // ADST_ADST = 3
};
-void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct4x4_c(input, output, stride);
} else {
- int16_t out[4 * 4];
- int16_t *outptr = &out[0];
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
// Columns
@@ -199,10 +203,10 @@
}
}
-static void fdct8(const int16_t *input, int16_t *output) {
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -251,9 +255,9 @@
output[7] = fdct_round_shift(t3);
}
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c)
sum += input[r * stride + c];
@@ -262,16 +266,16 @@
output[1] = 0;
}
-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
int i, j;
- int16_t intermediate[64];
+ tran_low_t intermediate[64];
// Transform columns
{
- int16_t *output = intermediate;
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
int i;
for (i = 0; i < 8; i++) {
@@ -333,9 +337,9 @@
}
}
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
@@ -344,7 +348,7 @@
output[1] = 0;
}
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -353,37 +357,38 @@
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[256];
- const int16_t *in = input;
- int16_t *out = intermediate;
+ tran_low_t intermediate[256];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- /*canbe16*/ int step1[8];
- /*canbe16*/ int step2[8];
- /*canbe16*/ int step3[8];
- /*canbe16*/ int input[8];
- /*needs32*/ int temp1, temp2;
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
- input[0] = (in[0 * stride] + in[15 * stride]) * 4;
- input[1] = (in[1 * stride] + in[14 * stride]) * 4;
- input[2] = (in[2 * stride] + in[13 * stride]) * 4;
- input[3] = (in[3 * stride] + in[12 * stride]) * 4;
- input[4] = (in[4 * stride] + in[11 * stride]) * 4;
- input[5] = (in[5 * stride] + in[10 * stride]) * 4;
- input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
- input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
+ input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+ input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+ input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+ input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+ input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+ input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+ input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+ input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
- step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
- step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
- step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
- step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
- step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
- step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
- step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
+ step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+ step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+ step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+ step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+ step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+ step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+ step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+ step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -406,9 +411,9 @@
}
// Work on the first eight values; fdct8(input, even_results);
{
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -514,6 +519,7 @@
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
+ in_pass0++;
out += 16;
}
// Setup in/out for next pass.
@@ -522,17 +528,17 @@
}
}
-static void fadst8(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[7];
- int x1 = input[0];
- int x2 = input[5];
- int x3 = input[2];
- int x4 = input[3];
- int x5 = input[4];
- int x6 = input[1];
- int x7 = input[6];
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
// stage 1
s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
@@ -600,15 +606,15 @@
{ fadst8, fadst8 } // ADST_ADST = 3
};
-void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct8x8_c(input, output, stride);
} else {
- int16_t out[64];
- int16_t *outptr = &out[0];
+ tran_low_t out[64];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
// Columns
@@ -633,17 +639,18 @@
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
int i;
- int a1, b1, c1, d1, e1;
- const int16_t *ip = input;
- int16_t *op = output;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0 * stride];
- b1 = ip[1 * stride];
- c1 = ip[2 * stride];
- d1 = ip[3 * stride];
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
a1 += b1;
d1 = d1 - c1;
@@ -657,7 +664,7 @@
op[8] = d1;
op[12] = b1;
- ip++;
+ ip_pass0++;
op++;
}
ip = output;
@@ -687,12 +694,12 @@
}
// Rewrote to use same algorithm as others.
-static void fdct16(const int16_t in[16], int16_t out[16]) {
- /*canbe16*/ int step1[8];
- /*canbe16*/ int step2[8];
- /*canbe16*/ int step3[8];
- /*canbe16*/ int input[8];
- /*needs32*/ int temp1, temp2;
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
// step 1
input[0] = in[0] + in[15];
@@ -715,9 +722,9 @@
// fdct8(step, step);
{
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -828,25 +835,26 @@
out[15] = fdct_round_shift(temp2);
}
-static void fadst16(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
- int x0 = input[15];
- int x1 = input[0];
- int x2 = input[13];
- int x3 = input[2];
- int x4 = input[11];
- int x5 = input[4];
- int x6 = input[9];
- int x7 = input[6];
- int x8 = input[7];
- int x9 = input[8];
- int x10 = input[5];
- int x11 = input[10];
- int x12 = input[3];
- int x13 = input[12];
- int x14 = input[1];
- int x15 = input[14];
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
// stage 1
s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
@@ -997,15 +1005,15 @@
{ fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct16x16_c(input, output, stride);
} else {
- int16_t out[256];
- int16_t *outptr = &out[0];
+ tran_low_t out[256];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
// Columns
@@ -1028,19 +1036,21 @@
}
}
-static INLINE int dct_32_round(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(-131072 <= rv && rv <= 131071);
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+ // and make the bounds consts.
+ // assert(-131072 <= rv && rv <= 131071);
return rv;
}
-static INLINE int half_round_shift(int input) {
- int rv = (input + 1 + (input < 0)) >> 2;
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+ tran_high_t rv = (input + 1 + (input < 0)) >> 2;
return rv;
}
-static void fdct32(const int *input, int *output, int round) {
- int step[32];
+static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+ tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
step[1] = input[1] + input[(32 - 2)];
@@ -1362,9 +1372,9 @@
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c)
sum += input[r * stride + c];
@@ -1373,13 +1383,13 @@
output[1] = 0;
}
-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
- int output[32 * 32];
+ tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
@@ -1389,7 +1399,7 @@
// Rows
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 0);
@@ -1401,13 +1411,13 @@
// Note that although we use dct_32_round in dct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
- int output[32 * 32];
+ tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
@@ -1420,7 +1430,7 @@
// Rows
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 1);
@@ -1428,3 +1438,61 @@
out[j + i * 32] = temp_out[j];
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_1_c(input, output, stride);
+}
+
+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_c(input, output, stride);
+}
+
+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht16x16_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_1_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ vp9_fdct32x32_rd_c(input, out, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
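
The widening above guards against overflow: with CONFIG_VP9_HIGHBITDEPTH, coefficients move from int16_t to tran_low_t and intermediates to tran_high_t. A minimal sketch of why 16-bit intermediates are not enough for 12-bit input, using hypothetical typedefs standing in for the real ones in the codec headers:

    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t tran_low_t;   /* assumed widened coefficient type */
    typedef int64_t tran_high_t;  /* assumed widened intermediate type */

    int main(void) {
      /* A 12-bit residual extreme is +/-4095. The fdct32x32 column pass
       * scales the input by 4 and sums up to 32 taps, so a crude upper
       * bound on an intermediate is 32 * 4 * 4095 = 524160, well above
       * INT16_MAX (32767). */
      const tran_high_t peak = (tran_high_t)32 * 4 * 4095;
      printf("peak ~%lld vs INT16_MAX %d\n", (long long)peak, INT16_MAX);
      return 0;
    }
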
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 90ea9cc..75b9449 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -78,7 +78,8 @@
int mc_avg_stride,
uint8_t *avg, int avg_stride,
int increase_denoising,
- BLOCK_SIZE bs) {
+ BLOCK_SIZE bs,
+ int motion_magnitude) {
int r, c;
const uint8_t *sig_start = sig;
const uint8_t *mc_avg_start = mc_avg;
@@ -86,6 +87,19 @@
int diff, adj, absdiff, delta;
int adj_val[] = {3, 4, 6};
int total_adj = 0;
+ int shift_inc = 1;
+
+ // If motion_magnitude is small, make the denoiser more aggressive by
+ // increasing the adjustment for each level. Add another increment for
+ // blocks that are labeled for increased denoising.
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
// First attempt to apply a strong temporal denoising filter.
for (r = 0; r < heights[bs]; ++r) {
@@ -130,7 +144,8 @@
// Otherwise, we try to dampen the filter if the delta is not too high.
delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising))
>> 8) + 1;
- if (delta > delta_thresh(bs, increase_denoising)) {
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
return COPY_BLOCK;
}
@@ -145,11 +160,17 @@
adj = delta;
}
if (diff > 0) {
+ // A positive diff means we made a positive adjustment above
+ // (in the first attempt), so now make a negative adjustment to bring
+ // the denoised signal down.
avg[c] = MAX(0, avg[c] - adj);
- total_adj += adj;
- } else {
- avg[c] = MIN(UINT8_MAX, avg[c] + adj);
total_adj -= adj;
+ } else {
+ // A negative diff means we made a negative adjustment above
+ // (in the first attempt), so now make a positive adjustment to bring
+ // the denoised signal up.
+ avg[c] = MIN(UINT8_MAX, avg[c] + adj);
+ total_adj += adj;
}
}
sig += sig_stride;
@@ -185,7 +206,8 @@
int increase_denoising,
int mi_row,
int mi_col,
- PICK_MODE_CONTEXT *ctx
+ PICK_MODE_CONTEXT *ctx,
+ int *motion_magnitude
) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
@@ -210,6 +232,8 @@
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
+ *motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
frame = ctx->best_reference_frame;
// If the best reference frame uses inter-prediction and there is enough of a
@@ -297,6 +321,7 @@
void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
int mi_row, int mi_col, BLOCK_SIZE bs,
PICK_MODE_CONTEXT *ctx) {
+ int motion_magnitude = 0;
VP9_DENOISER_DECISION decision = FILTER_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -307,13 +332,14 @@
decision = perform_motion_compensation(denoiser, mb, bs,
denoiser->increase_denoising,
- mi_row, mi_col, ctx);
+ mi_row, mi_col, ctx,
+ &motion_magnitude);
if (decision == FILTER_BLOCK) {
decision = denoiser_filter(src.buf, src.stride,
mc_avg_start, mc_avg.y_stride,
avg_start, avg.y_stride,
- 0, bs);
+ 0, bs, motion_magnitude);
}
if (decision == FILTER_BLOCK) {
@@ -370,8 +396,8 @@
ctx->newmv_sse = UINT_MAX;
}
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
- unsigned int sse, PREDICTION_MODE mode,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
+ PREDICTION_MODE mode,
PICK_MODE_CONTEXT *ctx) {
// TODO(tkopp): Use both MVs if possible
if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
@@ -388,13 +414,21 @@
}
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
- int ssx, int ssy, int border) {
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border) {
int i, fail;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
- ssx, ssy, border);
+ ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -405,7 +439,11 @@
}
fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
- ssx, ssy, border);
+ ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
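
The filter change above keys off motion. A hedged sketch of the adjustment-table bump, mirroring the diff (the helper name is hypothetical):

    #define MOTION_MAGNITUDE_THRESHOLD (8 * 3)

    /* Bump each adjustment level for low-motion blocks; shift_inc grows
     * to 2 when the caller asked for increased denoising. */
    static void bump_adj_for_low_motion(int adj_val[3], int motion_magnitude,
                                        int increase_denoising) {
      if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
        const int shift_inc = increase_denoising ? 2 : 1;
        adj_val[0] += shift_inc;  /* {3, 4, 6} -> {4, 5, 7} or {5, 6, 8} */
        adj_val[1] += shift_inc;
        adj_val[2] += shift_inc;
      }
    }

motion_magnitude itself is mv_row * mv_row + mv_col * mv_col from the best-SSE motion vector, as set in perform_motion_compensation above.
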
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index d93846f..fa714b1 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,6 +18,8 @@
extern "C" {
#endif
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
FILTER_BLOCK
@@ -42,12 +44,16 @@
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
-void vp9_denoiser_update_frame_stats(VP9_DENOISER *denoiser, MB_MODE_INFO *mbmi,
+void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi,
unsigned int sse, PREDICTION_MODE mode,
PICK_MODE_CONTEXT *ctx);
int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
- int ssx, int ssy, int border);
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border);
void vp9_denoiser_free(VP9_DENOISER *denoiser);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b20b662..c62b52f 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -539,10 +539,9 @@
}
// For cyclic refresh mode, update the segment map, set the segment id
// and then update the quantizer.
- else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
mi_row, mi_col, bsize, 1);
- vp9_init_plane_quantizers(cpi, x);
}
}
@@ -728,6 +727,7 @@
p[i].eobs = ctx->eobs_pbuf[i][0];
}
ctx->is_coded = 0;
+ ctx->skippable = 0;
x->skip_recode = 0;
// Set to zero to make sure we do not use the previous encoded frame stats
@@ -1233,30 +1233,23 @@
}
}
-static int is_background(VP9_COMP *cpi, const TileInfo *const tile,
+static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
int mi_row, int mi_col) {
- MACROBLOCK *x = &cpi->mb;
- uint8_t *src, *pre;
- int src_stride, pre_stride;
-
+ // This assumes the input source frames are of the same dimension.
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
-
+ const int x = mi_col * MI_SIZE;
+ const int y = mi_row * MI_SIZE;
+ const int src_stride = cpi->Source->y_stride;
+ const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
+ const int pre_stride = cpi->Last_Source->y_stride;
+ const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
int this_sad = 0;
int threshold = 0;
- // This assumes the input source frames are of the same dimension.
- src_stride = cpi->Source->y_stride;
- src = cpi->Source->y_buffer + (mi_row * MI_SIZE) * src_stride +
- (mi_col * MI_SIZE);
- pre_stride = cpi->Last_Source->y_stride;
- pre = cpi->Last_Source->y_buffer + (mi_row * MI_SIZE) * pre_stride +
- (mi_col * MI_SIZE);
-
if (row8x8_remaining >= MI_BLOCK_SIZE &&
col8x8_remaining >= MI_BLOCK_SIZE) {
- this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride,
- pre, pre_stride);
+ this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
threshold = (1 << 12);
} else {
int r, c;
@@ -1267,8 +1260,7 @@
threshold = (row8x8_remaining * col8x8_remaining) << 6;
}
- x->in_static_area = (this_sad < 2 * threshold);
- return x->in_static_area;
+ return this_sad < 2 * threshold;
}
static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8,
@@ -1724,7 +1716,8 @@
// function so repeat calls can accumulate a min and max of more than one sb64.
static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
BLOCK_SIZE *min_block_size,
- BLOCK_SIZE *max_block_size ) {
+ BLOCK_SIZE *max_block_size,
+ int bs_hist[BLOCK_SIZES]) {
int sb_width_in_blocks = MI_BLOCK_SIZE;
int sb_height_in_blocks = MI_BLOCK_SIZE;
int i, j;
@@ -1735,6 +1728,7 @@
for (j = 0; j < sb_width_in_blocks; ++j) {
MODE_INFO * mi = mi_8x8[index+j];
BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
+ bs_hist[sb_type]++;
*min_block_size = MIN(*min_block_size, sb_type);
*max_block_size = MAX(*max_block_size, sb_type);
}
@@ -1767,6 +1761,9 @@
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
BLOCK_SIZE max_size = BLOCK_64X64;
+ int i = 0;
+ int bs_hist[BLOCK_SIZES] = {0};
+
// Trap case where we do not have a prediction.
if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
// Default "min to max" and "max to min"
@@ -1779,22 +1776,51 @@
if (cm->frame_type != KEY_FRAME) {
MODE_INFO **const prev_mi =
&cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
- get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size);
+ get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
}
// Find the min and max partition sizes used in the left SB64
if (left_in_image) {
MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
- get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size);
+ get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
+ bs_hist);
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
- get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size);
+ get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
+ bs_hist);
}
+
// adjust observed min and max
if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
min_size = min_partition_size[min_size];
max_size = max_partition_size[max_size];
+ } else if (cpi->sf.auto_min_max_partition_size ==
+ CONSTRAIN_NEIGHBORING_MIN_MAX) {
+ // Adjust the search range based on the histogram of the observed
+ // partition sizes from the left, above, and previous co-located blocks.
+ int sum = 0;
+ int first_moment = 0;
+ int second_moment = 0;
+ int var_unnormalized = 0;
+
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ sum += bs_hist[i];
+ first_moment += bs_hist[i] * i;
+ second_moment += bs_hist[i] * i * i;
+ }
+
+ // If the variance is small enough, adjust the range around the mean
+ // size, which gives a tighter search range.
+ var_unnormalized = second_moment - first_moment * first_moment / sum;
+ if (var_unnormalized <= 4 * sum) {
+ int mean = first_moment / sum;
+ min_size = min_partition_size[mean];
+ max_size = max_partition_size[mean];
+ } else {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
}
}
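
The new CONSTRAIN_NEIGHBORING_MIN_MAX branch is a plain variance test over the block-size histogram: with sum = sum(h[i]), m1 = sum(i*h[i]) and m2 = sum(i*i*h[i]), it checks m2 - m1*m1/sum <= 4*sum, i.e. an (integer-truncated) variance of at most 4. A self-contained sketch, assuming VP9's 13 block sizes:

    #define BLOCK_SIZES 13

    /* Returns 1 when the histogram of neighboring partition sizes is
     * concentrated enough to center the search range on its mean. */
    static int partition_hist_is_tight(const int bs_hist[BLOCK_SIZES]) {
      int i, sum = 0, m1 = 0, m2 = 0;
      for (i = 0; i < BLOCK_SIZES; ++i) {
        sum += bs_hist[i];
        m1 += bs_hist[i] * i;
        m2 += bs_hist[i] * i * i;
      }
      /* m2 - m1*m1/sum <= 4*sum  <=>  variance <= 4 (integer math) */
      return sum > 0 && m2 - m1 * m1 / sum <= 4 * sum;
    }
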
@@ -1811,6 +1837,7 @@
next_square_size[max_size] < min_size) {
min_size = next_square_size[max_size];
}
+
*min_block_size = min_size;
*max_block_size = max_size;
}
@@ -2061,17 +2088,6 @@
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- if (cpi->sf.disable_split_var_thresh && partition_none_allowed) {
- unsigned int source_variancey;
- vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
- source_variancey = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
- if (source_variancey < cpi->sf.disable_split_var_thresh) {
- do_split = 0;
- if (source_variancey < cpi->sf.disable_split_var_thresh / 2)
- do_rect = 0;
- }
- }
-
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -2143,8 +2159,8 @@
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
if (sum_rd < best_rd) {
- int64_t stop_thresh = 4096;
- int64_t stop_thresh_rd;
+ int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr;
+ int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
best_rate = this_rate;
best_dist = this_dist;
@@ -2152,14 +2168,18 @@
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = PARTITION_NONE;
- // Adjust threshold according to partition size.
- stop_thresh >>= 8 - (b_width_log2(bsize) +
+ // Adjust dist breakout threshold according to the partition size.
+ dist_breakout_thr >>= 8 - (b_width_log2(bsize) +
b_height_log2(bsize));
- stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
- // If obtained distortion is very small, choose current partition
- // and stop splitting.
- if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
+ // If all y, u, v transform blocks in this partition are skippable, and
+ // the dist & rate are within the thresholds, the partition search is
+ // terminated for the current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable the
+ // early termination at that speed.
+ if (!x->e_mbd.lossless &&
+ (ctx->skippable && best_dist < dist_breakout_thr &&
+ best_rate < rate_breakout_thr)) {
do_split = 0;
do_rect = 0;
}
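
The breakout replaces the fixed stop_thresh with speed-feature thresholds scaled by partition size. Since b_width_log2 + b_height_log2 is 8 for 64x64 and 2 for 8x8, the shift 8 - (w + h) leaves the 64x64 threshold intact and divides the 8x8 threshold by 64. A sketch of just the scaling (helper name hypothetical):

    /* Scale the distortion breakout threshold down for small partitions. */
    static int64_t scaled_dist_breakout_thr(int64_t thr, int b_width_log2,
                                            int b_height_log2) {
      /* 64x64: shift by 8 - (4 + 4) = 0 -> full threshold.
       * 8x8:   shift by 8 - (1 + 1) = 6 -> threshold / 64. */
      return thr >> (8 - (b_width_log2 + b_height_log2));
    }
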
@@ -2446,6 +2466,9 @@
vp9_zero(cpi->mb.pred_mv);
cpi->pc_root->index = 0;
+ // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
+ // quality encoding. It still needs to be evaluated in real-time encoding to
+ // decide whether it can be removed there too, followed by the code cleanup.
if ((sf->partition_search_type == SEARCH_PARTITION &&
sf->use_lastframe_partitioning) ||
sf->partition_search_type == FIXED_PARTITION ||
@@ -2480,7 +2503,7 @@
if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE)
last_was_mid_sequence_overlay = 1;
}
- if ((cm->current_video_frame
+ if ((cpi->rc.frames_since_key
% sf->last_partitioning_redo_frequency) == 0
|| last_was_mid_sequence_overlay
|| cm->prev_mi == 0
@@ -2581,35 +2604,15 @@
}
static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
- if (cpi->mb.e_mbd.lossless) {
+ if (cpi->mb.e_mbd.lossless)
return ONLY_4X4;
- } else if (cpi->common.current_video_frame == 0) {
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+ return ALLOW_32X32;
+ else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+ cpi->sf.tx_size_search_method == USE_TX_8X8)
return TX_MODE_SELECT;
- } else {
- if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
- return ALLOW_32X32;
- } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) {
- const RD_OPT *const rd_opt = &cpi->rd;
- const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
- return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] >
- rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ?
- ALLOW_32X32 : TX_MODE_SELECT;
- } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) {
- return TX_MODE_SELECT;
- } else {
- unsigned int total = 0;
- int i;
- for (i = 0; i < TX_SIZES; ++i)
- total += cpi->tx_stepdown_count[i];
-
- if (total) {
- const double fraction = (double)cpi->tx_stepdown_count[0] / total;
- return fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT;
- } else {
- return cpi->common.tx_mode;
- }
- }
- }
+ else
+ return cpi->common.tx_mode;
}
static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
@@ -3114,7 +3117,7 @@
break;
case REFERENCE_PARTITION:
if (sf->partition_check ||
- !is_background(cpi, tile, mi_row, mi_col)) {
+ !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
set_modeinfo_offsets(cm, xd, mi_row, mi_col);
auto_partition_range(cpi, tile, mi_row, mi_col,
&sf->min_partition_size,
@@ -3292,7 +3295,6 @@
vp9_zero(cm->counts);
vp9_zero(cpi->coef_counts);
- vp9_zero(cpi->tx_stepdown_count);
vp9_zero(rd_opt->comp_pred_diff);
vp9_zero(rd_opt->filter_diff);
vp9_zero(rd_opt->tx_select_diff);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 8a737e1..794e6d0 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -107,9 +107,9 @@
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
uint8_t token_cache[1024];
- const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const int default_eob = 16 << (tx_size << 1);
@@ -294,22 +294,33 @@
}
static INLINE void fdct32x32(int rd_transform,
- const int16_t *src, int16_t *dst, int src_stride) {
+ const int16_t *src, tran_low_t *dst,
+ int src_stride) {
if (rd_transform)
vp9_fdct32x32_rd(src, dst, src_stride);
else
vp9_fdct32x32(src, dst, src_stride);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
+ tran_low_t *dst, int src_stride) {
+ if (rd_transform)
+ vp9_high_fdct32x32_rd(src, dst, src_stride);
+ else
+ vp9_high_fdct32x32(src, dst, src_stride);
+}
+#endif
+
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -357,9 +368,9 @@
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -405,9 +416,9 @@
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -458,7 +469,7 @@
struct optimize_ctx *const ctx = args->ctx;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
@@ -476,20 +487,24 @@
}
if (!x->skip_recode) {
- if (x->skip_txfm[plane] == 0) {
- // full forward transform and quantization
- if (x->quant_fp)
- vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
- else
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- } else if (x->skip_txfm[plane] == 2) {
- // fast path forward transform and quantization
- vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
+ // full forward transform and quantization
+ if (x->quant_fp)
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
+ else
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
} else {
- // skip forward transform
- p->eobs[block] = 0;
- *a = *l = 0;
- return;
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
}
}
@@ -534,7 +549,7 @@
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -583,9 +598,9 @@
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const scan_order *scan_order;
TX_TYPE tx_type;
PREDICTION_MODE mode;
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 9ad6db0..9d42a12 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -216,7 +216,7 @@
// If auto_mv_step_size is enabled then keep track of the largest
// motion vector component used.
- if (!cpi->dummy_packing && cpi->sf.mv.auto_mv_step_size) {
+ if (cpi->sf.mv.auto_mv_step_size) {
unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3;
cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude);
}
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 524744c..b3884d0 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -24,6 +24,7 @@
#include "vp9/common/vp9_postproc.h"
#endif
#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_tile_common.h"
@@ -128,11 +129,13 @@
}
if (cm->frame_type == KEY_FRAME) {
- if (!is_spatial_svc(cpi))
+ if (!is_two_pass_svc(cpi))
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
+ vp9_zero(cpi->interp_filter_selected);
} else {
cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cpi->interp_filter_selected[0]);
}
}
@@ -140,7 +143,9 @@
static int init_done = 0;
if (!init_done) {
+ vp9_rtcd();
vp9_init_neighbors();
+ vp9_init_intra_predictors();
vp9_coef_tree_initialize();
vp9_tokenize_initialize();
vp9_init_me_luts();
@@ -167,6 +172,26 @@
vpx_free(cpi->complexity_map);
cpi->complexity_map = NULL;
+ vpx_free(cpi->nmvcosts[0]);
+ vpx_free(cpi->nmvcosts[1]);
+ cpi->nmvcosts[0] = NULL;
+ cpi->nmvcosts[1] = NULL;
+
+ vpx_free(cpi->nmvcosts_hp[0]);
+ vpx_free(cpi->nmvcosts_hp[1]);
+ cpi->nmvcosts_hp[0] = NULL;
+ cpi->nmvcosts_hp[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts[0]);
+ vpx_free(cpi->nmvsadcosts[1]);
+ cpi->nmvsadcosts[0] = NULL;
+ cpi->nmvsadcosts[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts_hp[0]);
+ vpx_free(cpi->nmvsadcosts_hp[1]);
+ cpi->nmvsadcosts_hp[0] = NULL;
+ cpi->nmvsadcosts_hp[1] = NULL;
+
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
@@ -212,8 +237,15 @@
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
- vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts);
- vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp);
+
+ vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+ MV_VALS * sizeof(*cpi->nmvcosts[0]));
+ vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+ MV_VALS * sizeof(*cpi->nmvcosts[1]));
+ vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+ vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
@@ -233,8 +265,15 @@
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
- vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts);
- vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp);
+
+ vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
+ MV_VALS * sizeof(*cc->nmvcosts[0]));
+ vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
+ MV_VALS * sizeof(*cc->nmvcosts[1]));
+ vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+ vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
@@ -386,27 +425,15 @@
}
}
-
-static void set_speed_features(VP9_COMP *cpi) {
-#if CONFIG_INTERNAL_STATS
- int i;
- for (i = 0; i < MAX_MODES; ++i)
- cpi->mode_chosen_counts[i] = 0;
-#endif
-
- vp9_set_speed_features(cpi);
-
- // Set rd thresholds based on mode and speed setting
- vp9_set_rd_speed_thresholds(cpi);
- vp9_set_rd_speed_thresholds_sub8x8(cpi);
-}
-
static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
oxcf->lag_in_frames);
if (!cpi->lookahead)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
@@ -415,6 +442,9 @@
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
@@ -432,6 +462,9 @@
if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -439,6 +472,9 @@
if (vp9_realloc_frame_buffer(&cpi->scaled_source,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
@@ -446,6 +482,9 @@
if (vp9_realloc_frame_buffer(&cpi->scaled_last_source,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
@@ -474,10 +513,13 @@
vp9_init_context_buffers(cm);
init_macroblockd(cm, xd);
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
@@ -485,18 +527,10 @@
}
void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
- cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate;
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
vp9_rc_update_framerate(cpi);
}
-int64_t vp9_rescale(int64_t val, int64_t num, int denom) {
- int64_t llnum = num;
- int64_t llden = denom;
- int64_t llval = val;
-
- return (llval * llnum / llden);
-}
-
static void set_tile_limits(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -518,9 +552,13 @@
VP9_COMMON *const cm = &cpi->common;
cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->init_framerate;
cm->profile = oxcf->profile;
cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
cm->color_space = UNKNOWN;
cm->width = oxcf->width;
@@ -532,10 +570,10 @@
// Temporal scalability.
cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
- if ((cpi->svc.number_temporal_layers > 1 &&
- cpi->oxcf.rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 &&
- cpi->oxcf.mode == TWO_PASS_SECOND_BEST)) {
+ if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass == 2)) {
vp9_init_layer_context(cpi);
}
@@ -550,6 +588,20 @@
set_tile_limits(cpi);
}
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+ const VP9EncoderConfig *oxcf) {
+ const int64_t bandwidth = oxcf->target_bandwidth;
+ const int64_t starting = oxcf->starting_buffer_level_ms;
+ const int64_t optimal = oxcf->optimal_buffer_level_ms;
+ const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+ rc->starting_buffer_level = starting * bandwidth / 1000;
+ rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8
+ : optimal * bandwidth / 1000;
+ rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8
+ : maximum * bandwidth / 1000;
+}
+
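
set_rc_buffer_sizes centralizes the old inline math: each level in bits is ms * target_bandwidth / 1000, and optimal/maximum default to one eighth of the per-second bandwidth when configured as 0. A worked example with hypothetical numbers:

    #include <stdio.h>

    int main(void) {
      /* Hypothetical config: 800 kbit/s target, 4000 ms starting level,
       * optimal and maximum left at 0 so the bw / 8 default applies. */
      const long long bandwidth = 800000;
      const long long starting = 4000 * bandwidth / 1000;  /* 3200000 bits */
      const long long optimal = bandwidth / 8;             /*  100000 bits */
      printf("starting=%lld optimal=%lld\n", starting, optimal);
      return 0;
    }
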
void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -559,11 +611,16 @@
cm->bit_depth = oxcf->bit_depth;
if (cm->profile <= PROFILE_1)
- assert(cm->bit_depth == BITS_8);
+ assert(cm->bit_depth == VPX_BITS_8);
else
- assert(cm->bit_depth > BITS_8);
+ assert(cm->bit_depth > VPX_BITS_8);
cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->oxcf.use_highbitdepth) {
+ cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+ }
+#endif
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -583,35 +640,15 @@
}
cpi->encode_breakout = cpi->oxcf.encode_breakout;
- // local file playback mode == really big buffer
- if (cpi->oxcf.rc_mode == VPX_VBR) {
- cpi->oxcf.starting_buffer_level_ms = 60000;
- cpi->oxcf.optimal_buffer_level_ms = 60000;
- cpi->oxcf.maximum_buffer_size_ms = 240000;
- }
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
- rc->starting_buffer_level = vp9_rescale(cpi->oxcf.starting_buffer_level_ms,
- cpi->oxcf.target_bandwidth, 1000);
-
- // Set or reset optimal and maximum buffer levels.
- if (cpi->oxcf.optimal_buffer_level_ms == 0)
- rc->optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
- else
- rc->optimal_buffer_level = vp9_rescale(cpi->oxcf.optimal_buffer_level_ms,
- cpi->oxcf.target_bandwidth, 1000);
-
- if (cpi->oxcf.maximum_buffer_size_ms == 0)
- rc->maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
- else
- rc->maximum_buffer_size = vp9_rescale(cpi->oxcf.maximum_buffer_size_ms,
- cpi->oxcf.target_bandwidth, 1000);
// Under a configuration change, where maximum_buffer_size may change,
// keep buffer level clipped to the maximum allowed buffer size.
rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size);
rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size);
// Set up frame rate and related parameters rate control values.
- vp9_new_framerate(cpi, cpi->oxcf.framerate);
+ vp9_new_framerate(cpi, cpi->framerate);
// Set absolute upper and lower quality limits
rc->worst_quality = cpi->oxcf.worst_allowed_q;
@@ -633,7 +670,9 @@
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->oxcf.rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass == 2)) {
vp9_update_layer_context_change_config(cpi,
(int)cpi->oxcf.target_bandwidth);
}
@@ -656,6 +695,9 @@
if (cpi->oxcf.noise_sensitivity > 0) {
vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS);
}
#endif
@@ -722,19 +764,12 @@
cm->error.setjmp = 1;
- vp9_rtcd();
-
cpi->use_svc = 0;
init_config(cpi, oxcf);
vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
cm->current_video_frame = 0;
-
- cpi->gold_is_last = 0;
- cpi->alt_is_last = 0;
- cpi->gold_is_alt = 0;
-
cpi->skippable_frame = 0;
// Create the encoder segmentation map and set all entries to 0
@@ -754,6 +789,23 @@
CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+ CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0])); i++) {
CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -834,16 +886,16 @@
cpi->first_time_stamp_ever = INT64_MAX;
cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
- cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
- cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX];
- cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX];
- cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX];
+ cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
cal_nmvsadcosts(cpi->mb.nmvsadcost);
- cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX];
- cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX];
- cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX];
- cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX];
+ cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+ cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -860,8 +912,6 @@
kf_list = fopen("kf_list.stt", "w");
#endif
- cpi->output_pkt_list = oxcf->output_pkt_list;
-
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
if (oxcf->pass == 1) {
@@ -871,7 +921,7 @@
const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
if (cpi->svc.number_spatial_layers > 1
- && cpi->svc.number_temporal_layers == 1) {
+ || cpi->svc.number_temporal_layers > 1) {
FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0};
int i;
@@ -929,7 +979,7 @@
}
}
- set_speed_features(cpi);
+ vp9_set_speed_features(cpi);
// Allocate memory to store variances for a frame.
CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -1012,10 +1062,6 @@
vp9_sub_pixel_avg_variance4x4,
vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
- cpi->full_search_sad = vp9_full_search_sad;
- cpi->diamond_search_sad = vp9_diamond_search_sad;
- cpi->refining_search_sad = vp9_refining_search_sad;
-
/* vp9_init_quantizer() is first called here. Add check in
* vp9_frame_init_quantizer() so that vp9_init_quantizer is only
* called later when needed. This will avoid unnecessary calls of
@@ -1234,7 +1280,10 @@
pkt.data.psnr.psnr[i] = psnr.psnr[i];
}
pkt.kind = VPX_CODEC_PSNR_PKT;
- vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+ if (is_two_pass_svc(cpi))
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr;
+ else
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
}
int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -1418,40 +1467,6 @@
vp9_extend_frame_borders(dst);
}
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER
-void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
- FILE *yframe;
- int i;
- char filename[255];
-
- snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->y_height; i++)
- fwrite(frame->y_buffer + i * frame->y_stride,
- frame->y_width, 1, yframe);
-
- fclose(yframe);
- snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->u_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
- snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame);
- yframe = fopen(filename, "wb");
-
- for (i = 0; i < frame->uv_height; i++)
- fwrite(frame->v_buffer + i * frame->uv_stride,
- frame->uv_width, 1, yframe);
-
- fclose(yframe);
-}
-#endif
-
// Function to test for conditions that indicate we should loop
// back and recode a frame.
static int recode_loop_test(const VP9_COMP *cpi,
@@ -1517,7 +1532,7 @@
cpi->alt_fb_idx = cpi->gld_fb_idx;
cpi->gld_fb_idx = tmp;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx;
cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx;
}
@@ -1531,17 +1546,32 @@
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+ vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
if (cpi->refresh_golden_frame) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ else
+ vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
}
}
if (cpi->refresh_last_frame) {
ref_cnt_fb(cm->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
@@ -1596,6 +1626,9 @@
vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -1770,7 +1803,6 @@
// to recode.
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
save_coding_context(cpi);
- cpi->dummy_packing = 1;
if (!cpi->sf.use_nonrd_pick_mode)
vp9_pack_bitstream(cpi, dest, size);
@@ -1919,36 +1951,26 @@
} while (loop);
}
-static void get_ref_frame_flags(VP9_COMP *cpi) {
- if (cpi->refresh_last_frame & cpi->refresh_golden_frame)
- cpi->gold_is_last = 1;
- else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)
- cpi->gold_is_last = 0;
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+ const int *const map = cpi->common.ref_frame_map;
+ const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+ const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
- if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)
- cpi->alt_is_last = 1;
- else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)
- cpi->alt_is_last = 0;
+ if (gold_is_last)
+ flags &= ~VP9_GOLD_FLAG;
- if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)
- cpi->gold_is_alt = 1;
- else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)
- cpi->gold_is_alt = 0;
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+ flags &= ~VP9_GOLD_FLAG;
- cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+ if (alt_is_last)
+ flags &= ~VP9_ALT_FLAG;
- if (cpi->gold_is_last)
- cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
+ if (gold_is_alt)
+ flags &= ~VP9_ALT_FLAG;
- if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
- !is_spatial_svc(cpi))
- cpi->ref_frame_flags &= ~VP9_GOLD_FLAG;
-
- if (cpi->alt_is_last)
- cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
-
- if (cpi->gold_is_alt)
- cpi->ref_frame_flags &= ~VP9_ALT_FLAG;
+ return flags;
}
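
After the rewrite, reference aliasing is derived directly from the reference map: two references alias when their map entries index the same frame buffer, and the corresponding search flag is cleared. A hedged, self-contained sketch of just the masking (flag values are assumptions mirroring the VP9_*_FLAG macros; the real function additionally drops the gold flag when no golden-frame update is due):

    #define VP9_LAST_FLAG 1
    #define VP9_GOLD_FLAG 2
    #define VP9_ALT_FLAG  4

    /* map[] holds the buffer index each reference slot points at. */
    static int ref_flags_from_map(const int *map, int lst, int gld, int alt) {
      int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
      if (map[gld] == map[lst]) flags &= ~VP9_GOLD_FLAG; /* gold == last */
      if (map[alt] == map[lst]) flags &= ~VP9_ALT_FLAG;  /* alt == last */
      if (map[gld] == map[alt]) flags &= ~VP9_ALT_FLAG;  /* gold == alt */
      return flags;
    }
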
static void set_ext_overrides(VP9_COMP *cpi) {
@@ -1980,18 +2002,16 @@
}
}
-static void configure_skippable_frame(VP9_COMP *cpi) {
+static int is_skippable_frame(const VP9_COMP *cpi) {
// If no non-zero motion vector was detected for the current frame in the
// first pass, and the same holds for its previous and forward frames, then
// this frame can be skipped for the partition check, and the partition size
// is assigned according to the variance.
+ const SVC *const svc = &cpi->svc;
+ const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
- SVC *const svc = &cpi->svc;
- TWO_PASS *const twopass = is_spatial_svc(cpi) ?
- &svc->layer_context[svc->spatial_layer_id].twopass
- : &cpi->twopass;
-
- cpi->skippable_frame = (!frame_is_intra_only(&cpi->common) &&
+ return (!frame_is_intra_only(&cpi->common) &&
twopass->stats_in - 2 > twopass->stats_in_start &&
twopass->stats_in < twopass->stats_in_end &&
(twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
@@ -2017,19 +2037,69 @@
cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
}
+static void set_mv_search_params(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame)
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param =
+ vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
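
set_mv_search_params seeds the motion search range from history: on inter frames it allows twice the largest MV magnitude seen on the previous shown frame, capped by the resolution-based default, before handing the result to vp9_init_search_range. A sketch of just that clamp (the helper below is hypothetical):

    static unsigned int clamped_mv_range(unsigned int max_mv_def,
                                         unsigned int prev_max_magnitude) {
      /* Twice last frame's max magnitude, never beyond the default. */
      const unsigned int range = 2 * prev_max_magnitude;
      return range < max_mv_def ? range : max_mv_def;
    }
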
+
+
+int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+ INTERP_FILTER ifilter;
+ int ref_total[MAX_REF_FRAMES] = {0};
+ MV_REFERENCE_FRAME ref;
+ int mask = 0;
+ if (cpi->common.last_frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame)
+ return mask;
+ for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+ if ((ref_total[LAST_FRAME] &&
+ cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+ (ref_total[GOLDEN_FRAME] == 0 ||
+ cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
+ < ref_total[GOLDEN_FRAME]) &&
+ (ref_total[ALTREF_FRAME] == 0 ||
+ cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
+ < ref_total[ALTREF_FRAME]))
+ mask |= 1 << ifilter;
+ }
+ return mask;
+}
+
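
setup_interp_filter_search_mask flags interpolation filters that were essentially never chosen on the previous frame: a filter's bit is set when LAST_FRAME never picked it and it accounted for fewer than 1 in 50 (2%) of GOLDEN/ALTREF selections, so the RD loop can skip it when sf.adaptive_interp_filter_search is enabled. A sketch of the consumer side (helper name hypothetical):

    /* A set bit means the filter may be skipped in the RD search. */
    static int filter_is_masked(int search_mask, int ifilter) {
      return (search_mask >> ifilter) & 1;
    }
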
static void encode_frame_to_data_rate(VP9_COMP *cpi,
size_t *size,
uint8_t *dest,
unsigned int *frame_flags) {
VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
TX_SIZE t;
int q;
int top_index;
int bottom_index;
- const SPEED_FEATURES *const sf = &cpi->sf;
- const unsigned int max_mv_def = MIN(cm->width, cm->height);
- struct segmentation *const seg = &cm->seg;
set_ext_overrides(cpi);
cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
@@ -2055,24 +2125,13 @@
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
- // Initialize cpi->mv_step_param to default based on max resolution.
- cpi->mv_step_param = vp9_init_search_range(max_mv_def);
- // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate.
- if (sf->mv.auto_mv_step_size) {
- if (frame_is_intra_only(cm)) {
- // Initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame.
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // Allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution.
- cpi->mv_step_param = vp9_init_search_range(MIN(max_mv_def, 2 *
- cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
+ set_mv_search_params(cpi);
+
+ if (cpi->oxcf.pass == 2 &&
+ cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask =
+ setup_interp_filter_search_mask(cpi);
+
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm)) {
@@ -2088,9 +2147,7 @@
// The alternate reference frame cannot be active for a key frame.
cpi->rc.source_alt_ref_active = 0;
- cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);
- cm->frame_parallel_decoding_mode =
- (cpi->oxcf.frame_parallel_decoding_mode != 0);
+ cm->error_resilient_mode = oxcf->error_resilient_mode;
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
@@ -2098,29 +2155,59 @@
cm->reset_frame_context = 0;
cm->refresh_frame_context = 0;
} else if (cm->intra_only) {
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// Only reset the current context.
cm->reset_frame_context = 2;
}
}
+ if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
+ cm->frame_context_idx =
+ cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id;
+
+ // The probs will be updated based on the frame type of its previous
+ // frame if frame_parallel_decoding_mode is 0. The type may vary for
+ // the frame after a key frame in the base layer since we may drop
+ // enhancement layers. So set frame_parallel_decoding_mode to 1 here.
+ if (cpi->svc.number_temporal_layers == 1) {
+ if (cpi->svc.spatial_layer_id == 0 &&
+ cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
+ cm->frame_parallel_decoding_mode = 1;
+ else
+ cm->frame_parallel_decoding_mode = 0;
+ } else if (cpi->svc.spatial_layer_id == 0) {
+ // Find the 2nd frame in the temporal base layer and the 1st frame in
+ // the temporal enhancement layers from the key frame.
+ int i;
+ for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+ if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+ cm->frame_parallel_decoding_mode = 1;
+ break;
+ }
+ }
+ if (i == cpi->svc.number_temporal_layers)
+ cm->frame_parallel_decoding_mode = 0;
+ }
+ }
// Configure experimental use of segmentation for enhanced coding of
// static regions if indicated.
// Only allowed in second pass of two pass (as requires lagged coding)
// and if the relevant speed feature flag is set.
- if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
configure_static_seg_features(cpi);
// Check if the current frame is skippable for the partition search in the
// second pass according to the first pass stats
- if (cpi->oxcf.pass == 2 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
- configure_skippable_frame(cpi);
+ if (oxcf->pass == 2 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->skippable_frame = is_skippable_frame(cpi);
}
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
- if (cpi->oxcf.pass == 0 &&
- cpi->oxcf.rc_mode == VPX_CBR &&
+ if (oxcf->pass == 0 &&
+ oxcf->rc_mode == VPX_CBR &&
cm->frame_type != KEY_FRAME) {
if (vp9_rc_drop_frame(cpi)) {
vp9_rc_postencode_update_drop_frame(cpi);
@@ -2132,9 +2219,9 @@
vp9_clear_system_state();
#if CONFIG_VP9_POSTPROC
- if (cpi->oxcf.noise_sensitivity > 0) {
+ if (oxcf->noise_sensitivity > 0) {
int l = 0;
- switch (cpi->oxcf.noise_sensitivity) {
+ switch (oxcf->noise_sensitivity) {
case 1:
l = 20;
break;
@@ -2156,7 +2243,18 @@
}
#endif
- set_speed_features(cpi);
+#if CONFIG_INTERNAL_STATS
+ {
+ int i;
+ for (i = 0; i < MAX_MODES; ++i)
+ cpi->mode_chosen_counts[i] = 0;
+ }
+#endif
+
+ vp9_set_speed_features(cpi);
+
+ vp9_set_rd_speed_thresholds(cpi);
+ vp9_set_rd_speed_thresholds_sub8x8(cpi);
// Decide q and q bounds.
q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
@@ -2175,7 +2273,7 @@
#if CONFIG_VP9_TEMPORAL_DENOISING
#ifdef OUTPUT_YUV_DENOISED
- if (cpi->oxcf.noise_sensitivity > 0) {
+ if (oxcf->noise_sensitivity > 0) {
vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
yuv_denoised_file);
}
@@ -2196,29 +2294,10 @@
cm->frame_to_show = get_frame_new_buffer(cm);
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame);
- else
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 1000);
-#endif
-
// Pick the loop filter level for the frame.
loopfilter_frame(cpi, cm);
-#if WRITE_RECON_BUFFER
- if (cm->show_frame)
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 2000);
- else
- write_cx_frame_to_file(cm->frame_to_show,
- cm->current_video_frame + 3000);
-#endif
-
// build the bitstream
- cpi->dummy_packing = 0;
vp9_pack_bitstream(cpi, dest, size);
if (cm->seg.update_map)
@@ -2250,7 +2329,7 @@
else
cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
- get_ref_frame_flags(cpi);
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
cm->last_frame_type = cm->frame_type;
vp9_rc_postencode_update(cpi, *size);
@@ -2277,8 +2356,12 @@
cm->last_height = cm->height;
// reset to normal state now that we are done.
- if (!cm->show_existing_frame)
- cm->last_show_frame = cm->show_frame;
+ if (!cm->show_existing_frame) {
+ if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)
+ cm->last_show_frame = 0;
+ else
+ cm->last_show_frame = cm->show_frame;
+ }
if (cm->show_frame) {
vp9_swap_mi_and_prev_mi(cm);
@@ -2287,8 +2370,12 @@
// update not a real frame
++cm->current_video_frame;
if (cpi->use_svc)
- vp9_inc_frame_in_layer(&cpi->svc);
+ vp9_inc_frame_in_layer(cpi);
}
+
+ if (is_two_pass_svc(cpi))
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
+ cm->frame_type;
}
static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
@@ -2361,7 +2448,7 @@
vpx_usec_timer_start(&timer);
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
+ if (is_two_pass_svc(cpi))
res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time,
frame_flags);
else
@@ -2403,18 +2490,19 @@
cm->seg.update_data;
}
-void adjust_frame_rate(VP9_COMP *cpi) {
+void adjust_frame_rate(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
int64_t this_duration;
int step = 0;
- if (cpi->source->ts_start == cpi->first_time_stamp_ever) {
- this_duration = cpi->source->ts_end - cpi->source->ts_start;
+ if (source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = source->ts_end - source->ts_start;
step = 1;
} else {
int64_t last_duration = cpi->last_end_time_stamp_seen
- cpi->last_time_stamp_seen;
- this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+ this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
// do a step update if the duration changes by 10%
if (last_duration)
@@ -2428,17 +2516,17 @@
// Average this frame's rate into the last second's average
// frame rate. If we haven't seen 1 second yet, then average
// over the whole interval seen.
- const double interval = MIN((double)(cpi->source->ts_end
+ const double interval = MIN((double)(source->ts_end
- cpi->first_time_stamp_ever), 10000000.0);
- double avg_duration = 10000000.0 / cpi->oxcf.framerate;
+ double avg_duration = 10000000.0 / cpi->framerate;
avg_duration *= (interval - avg_duration + this_duration);
avg_duration /= interval;
vp9_new_framerate(cpi, 10000000.0 / avg_duration);
}
}
- cpi->last_time_stamp_seen = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+ cpi->last_time_stamp_seen = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_end;
}
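
The framerate update above is an interval-weighted running average over 10 MHz timestamps: avg_duration starts at 10,000,000 / framerate ticks, is nudged toward this frame's duration by roughly one frame's share of the interval (which is capped at one second), and the new framerate is 10,000,000 / avg_duration. A worked example with hypothetical numbers:

    #include <stdio.h>

    int main(void) {
      const double interval = 10000000.0;    /* >= 1 s seen, cap applies */
      const double framerate = 30.0;         /* current estimate */
      const double this_duration = 400000.0; /* one 25 fps frame: 40 ms */

      double avg_duration = 10000000.0 / framerate;  /* ~333333 ticks */
      avg_duration *= (interval - avg_duration + this_duration);
      avg_duration /= interval;                      /* ~335556 ticks */
      printf("new framerate = %.2f\n", 10000000.0 / avg_duration); /* ~29.80 */
      return 0;
    }
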
// Returns 0 if this is not an alt ref; otherwise returns the offset of the
// source frame used as the alt ref.
@@ -2459,7 +2547,8 @@
return arf_src_index;
}
-static void check_src_altref(VP9_COMP *cpi) {
+static void check_src_altref(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
RATE_CONTROL *const rc = &cpi->rc;
if (cpi->oxcf.pass == 2) {
@@ -2468,7 +2557,7 @@
(gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
} else {
rc->is_src_frame_alt_ref = cpi->alt_ref_source &&
- (cpi->source == cpi->alt_ref_source);
+ (source == cpi->alt_ref_source);
}
if (rc->is_src_frame_alt_ref) {
@@ -2484,18 +2573,18 @@
int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end, int flush) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
RATE_CONTROL *const rc = &cpi->rc;
struct vpx_usec_timer cmptimer;
YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
MV_REFERENCE_FRAME ref_frame;
int arf_src_index;
- if (!cpi)
- return -1;
-
- if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) {
+ if (is_two_pass_svc(cpi) && oxcf->pass == 2) {
#if CONFIG_SPATIAL_SVC
vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1);
#endif
@@ -2504,9 +2593,6 @@
vpx_usec_timer_start(&cmptimer);
- cpi->source = NULL;
- cpi->last_source = NULL;
-
vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
// Normal defaults
@@ -2522,21 +2608,20 @@
assert(arf_src_index <= rc->frames_to_key);
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead,
- arf_src_index, 0);
+ if (is_two_pass_svc(cpi))
+ source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0);
else
#endif
- cpi->source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
- if (cpi->source != NULL) {
- cpi->alt_ref_source = cpi->source;
+ source = vp9_lookahead_peek(cpi->lookahead, arf_src_index);
+ if (source != NULL) {
+ cpi->alt_ref_source = source;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
+ if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) {
int i;
// Reference a hidden frame from a lower layer
for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
- if (cpi->oxcf.ss_play_alternate[i]) {
+ if (oxcf->ss_play_alternate[i]) {
cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
break;
}
@@ -2545,7 +2630,7 @@
cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1;
#endif
- if (cpi->oxcf.arnr_max_frames > 0) {
+ if (oxcf->arnr_max_frames > 0) {
// Produce the filtered ARF frame.
vp9_temporal_filter(cpi, arf_src_index);
vp9_extend_frame_borders(&cpi->alt_ref_buffer);
@@ -2563,62 +2648,57 @@
}
}
- if (!cpi->source) {
+ if (!source) {
// Get last frame source.
if (cm->current_video_frame > 0) {
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
+ if (is_two_pass_svc(cpi))
+ last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0);
else
#endif
- cpi->last_source = vp9_lookahead_peek(cpi->lookahead, -1);
- if (cpi->last_source == NULL)
+ last_source = vp9_lookahead_peek(cpi->lookahead, -1);
+ if (last_source == NULL)
return -1;
}
// Read in the source frame.
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
- cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+ if (is_two_pass_svc(cpi))
+ source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
else
#endif
- cpi->source = vp9_lookahead_pop(cpi->lookahead, flush);
- if (cpi->source != NULL) {
+ source = vp9_lookahead_pop(cpi->lookahead, flush);
+ if (source != NULL) {
cm->show_frame = 1;
cm->intra_only = 0;
// Check to see if the frame should be encoded as an arf overlay.
- check_src_altref(cpi);
+ check_src_altref(cpi, source);
}
}
- if (cpi->source) {
+ if (source) {
cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer
- : &cpi->source->img;
+ : &source->img;
- if (cpi->last_source != NULL) {
- cpi->unscaled_last_source = &cpi->last_source->img;
- } else {
- cpi->unscaled_last_source = NULL;
- }
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
- *time_stamp = cpi->source->ts_start;
- *time_end = cpi->source->ts_end;
- *frame_flags =
- (cpi->source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
} else {
*size = 0;
- if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) {
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
vp9_end_first_pass(cpi); /* get last stats packet */
cpi->twopass.first_pass_done = 1;
}
return -1;
}
- if (cpi->source->ts_start < cpi->first_time_stamp_ever) {
- cpi->first_time_stamp_ever = cpi->source->ts_start;
- cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
}
// Clear down mmx registers
@@ -2626,11 +2706,11 @@
// adjust frame rates based on timestamps given
if (cm->show_frame) {
- adjust_frame_rate(cpi);
+ adjust_frame_rate(cpi, source);
}
if (cpi->svc.number_temporal_layers > 1 &&
- cpi->oxcf.rc_mode == VPX_CBR) {
+ oxcf->rc_mode == VPX_CBR) {
vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi);
}
@@ -2647,7 +2727,7 @@
if (!cpi->use_svc && cpi->multi_arf_allowed) {
if (cm->frame_type == KEY_FRAME) {
init_buffer_indices(cpi);
- } else if (cpi->oxcf.pass == 2) {
+ } else if (oxcf->pass == 2) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
}
@@ -2655,19 +2735,22 @@
cpi->frame_flags = *frame_flags;
- if (cpi->oxcf.pass == 2 &&
+ if (oxcf->pass == 2 &&
cm->current_video_frame == 0 &&
- cpi->oxcf.allow_spatial_resampling &&
- cpi->oxcf.rc_mode == VPX_VBR) {
+ oxcf->allow_spatial_resampling &&
+ oxcf->rc_mode == VPX_VBR) {
// Internal scaling is triggered on the first frame.
- vp9_set_size_literal(cpi, cpi->oxcf.scaled_frame_width,
- cpi->oxcf.scaled_frame_height);
+ vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
}
// Reset the frame pointers to the current frame size
vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
alloc_util_frame_buffers(cpi);
@@ -2689,18 +2772,27 @@
set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ if (oxcf->aq_mode == VARIANCE_AQ) {
vp9_vaq_init();
}
- if (cpi->oxcf.pass == 1 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
- const int lossless = is_lossless_requested(&cpi->oxcf);
+ if (oxcf->pass == 1 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->oxcf.use_highbitdepth)
+ cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+ else
+ cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :
+ vp9_high_idct4x4_add;
+#else
cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif
cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
- vp9_first_pass(cpi);
- } else if (cpi->oxcf.pass == 2 &&
- (!cpi->use_svc || is_spatial_svc(cpi))) {
+ vp9_first_pass(cpi, source);
+ } else if (oxcf->pass == 2 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
Pass2Encode(cpi, size, dest, frame_flags);
} else if (cpi->use_svc) {
SvcEncode(cpi, size, dest, frame_flags);
@@ -2723,20 +2815,22 @@
// Save layer specific state.
if ((cpi->svc.number_temporal_layers > 1 &&
- cpi->oxcf.rc_mode == VPX_CBR) ||
- (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) {
+ oxcf->rc_mode == VPX_CBR) ||
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ oxcf->pass == 2)) {
vp9_save_layer_context(cpi);
}
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
- if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame)
+ if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
generate_psnr_packet(cpi);
#if CONFIG_INTERNAL_STATS
- if (cpi->oxcf.pass != 1) {
+ if (oxcf->pass != 1) {
cpi->bytes += (int)(*size);
if (cm->show_frame) {
@@ -2776,12 +2870,12 @@
cpi->totalp_sq_error += psnr2.sse[0];
cpi->totalp_samples += psnr2.samples[0];
- frame_ssim2 = vp9_calc_ssim(orig, recon, 1, &weight);
+ frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
- frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, 1, &weight);
+ frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
cpi->summedp_quality += frame_ssim2 * weight;
cpi->summedp_weights += weight;
@@ -2797,6 +2891,7 @@
}
}
+
if (cpi->b_calculate_ssimg) {
double y, u, v, frame_all;
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index c841da2..80774de 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -82,36 +82,19 @@
} VPX_SCALING;
typedef enum {
- // Good Quality Fast Encoding. The encoder balances quality with the
- // amount of time it takes to encode the output. (speed setting
- // controls how fast)
- ONE_PASS_GOOD = 1,
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD,
- // One Pass - Best Quality. The encoder places priority on the
- // quality of the output over encoding speed. The output is compressed
- // at the highest possible quality. This option takes the longest
- // amount of time to encode. (speed setting ignored)
- ONE_PASS_BEST = 2,
+ // The encoder places priority on the quality of the output over encoding
+ // speed. The output is compressed at the highest possible quality. This
+ // option takes the longest amount of time to encode. Speed setting ignored.
+ BEST,
- // Two Pass - First Pass. The encoder generates a file of statistics
- // for use in the second encoding pass. (speed setting controls how fast)
- TWO_PASS_FIRST = 3,
-
- // Two Pass - Second Pass. The encoder uses the statistics that were
- // generated in the first encoding pass to create the compressed
- // output. (speed setting controls how fast)
- TWO_PASS_SECOND_GOOD = 4,
-
- // Two Pass - Second Pass Best. The encoder uses the statistics that
- // were generated in the first encoding pass to create the compressed
- // output using the highest possible quality, and taking a
- // longer amount of time to encode. (speed setting ignored)
- TWO_PASS_SECOND_BEST = 5,
-
- // Realtime/Live Encoding. This mode is optimized for realtime
- // encoding (for example, capturing a television signal or feed from
- // a live camera). (speed setting controls how fast)
- REALTIME = 6,
+ // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+ // example, capturing a television signal or feed from a live camera). Speed
+ // setting controls how fast.
+ REALTIME
} MODE;
typedef enum {
@@ -131,10 +114,11 @@
typedef struct VP9EncoderConfig {
BITSTREAM_PROFILE profile;
- BIT_DEPTH bit_depth;
+ vpx_bit_depth_t bit_depth; // Codec bit-depth.
int width; // width of data passed to the compressor
int height; // height of data passed to the compressor
- double framerate; // set to passed in framerate
+ unsigned int input_bit_depth; // Input bit depth.
+ double init_framerate; // set to passed in framerate
int64_t target_bandwidth; // bandwidth to be used in kilobits per second
int noise_sensitivity; // pre processing blur: recommendation 0
@@ -220,39 +204,35 @@
int arnr_max_frames;
int arnr_strength;
- int arnr_type;
int tile_columns;
int tile_rows;
- struct vpx_fixed_buf two_pass_stats_in;
- struct vpx_codec_pkt_list *output_pkt_list;
+ vpx_fixed_buf_t two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
#if CONFIG_FP_MB_STATS
- struct vpx_fixed_buf firstpass_mb_stats_in;
+ vpx_fixed_buf_t firstpass_mb_stats_in;
#endif
vp8e_tuning tuning;
vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth;
+#endif
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
}
-static INLINE int is_best_mode(MODE mode) {
- return mode == ONE_PASS_BEST || mode == TWO_PASS_SECOND_BEST;
-}
-
typedef struct VP9_COMP {
QUANTS quants;
MACROBLOCK mb;
VP9_COMMON common;
VP9EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
- struct lookahead_entry *source;
struct lookahead_entry *alt_ref_source;
- struct lookahead_entry *last_source;
YV12_BUFFER_CONFIG *Source;
YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames
@@ -261,10 +241,6 @@
YV12_BUFFER_CONFIG *unscaled_last_source;
YV12_BUFFER_CONFIG scaled_last_source;
- int gold_is_last; // gold same as last frame ( short circuit gold searches)
- int alt_is_last; // Alt same as last ( short circuit altref search)
- int gold_is_alt; // don't do both alt and gold search ( just do gold).
-
int skippable_frame;
int scaled_ref_idx[3];
@@ -296,18 +272,23 @@
CODING_CONTEXT coding_context;
+ int *nmvcosts[2];
+ int *nmvcosts_hp[2];
+ int *nmvsadcosts[2];
+ int *nmvsadcosts_hp[2];
+
int zbin_mode_boost;
int zbin_mode_boost_enabled;
- int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
- int active_arnr_strength; // <= cpi->oxcf.arnr_max_strength
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
RATE_CONTROL rc;
+ double framerate;
vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
struct vpx_codec_pkt_list *output_pkt_list;
@@ -354,7 +335,7 @@
TWO_PASS twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
- YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+
#if CONFIG_INTERNAL_STATS
unsigned int mode_chosen_counts[MAX_MODES];
@@ -393,10 +374,6 @@
int droppable;
- int dummy_packing; /* flag to indicate if packing is dummy */
-
- unsigned int tx_stepdown_count[TX_SIZES];
-
int initial_width;
int initial_height;
@@ -415,7 +392,7 @@
search_site_config ss_cfg;
int mbmode_cost[INTRA_MODES];
- unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+ unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
@@ -496,14 +473,6 @@
.buf;
}
-// Intra only frames, golden frames (except alt ref overlays) and
-// alt ref frames tend to be coded at a higher than ambient quality
-static INLINE int frame_is_boosted(const VP9_COMP *cpi) {
- return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
- vp9_is_upper_layer_key_frame(cpi);
-}
-
static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
// TODO(JBB): double check we can't exceed this token count if we have a
// 32x32 transform crossing a boundary at a multiple of 16.
@@ -521,8 +490,6 @@
void vp9_update_reference_frames(VP9_COMP *cpi);
-int64_t vp9_rescale(int64_t val, int64_t num, int denom);
-
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
@@ -531,16 +498,17 @@
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
-static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
return cpi->use_svc &&
- cpi->svc.number_temporal_layers == 1 &&
- cpi->svc.number_spatial_layers > 1;
+ (cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
}
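The renamed predicate widens the old spatial-only check: any layered configuration (more than one temporal or spatial layer) running in pass 1 or pass 2 now takes the SVC two-pass paths. A minimal sketch of the behavior, with hypothetical field values that are not part of the patch:

    /* Hypothetical illustration of is_two_pass_svc() above.        */
    /* use_svc = 1, number_temporal_layers = 2,                     */
    /* number_spatial_layers = 1, oxcf.pass = 2                     */
    /*   -> returns 1 (temporal-only layering now qualifies)        */
    /* use_svc = 1, number_spatial_layers = 2, oxcf.pass = 0        */
    /*   -> returns 0 (one-pass SVC is excluded)                    */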
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
(cpi->oxcf.play_alternate &&
- (!is_spatial_svc(cpi) ||
+ (!is_two_pass_svc(cpi) ||
cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
}
@@ -557,6 +525,10 @@
return frame_index & 0x1;
}
+static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) {
+ return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
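cond_sad_list() above is the gate that decides whether the new five-entry SAD bookkeeping is worth collecting: it hands the caller's buffer through only when the subpel search method is something other than plain SUBPEL_TREE, since only the pruned variant consumes it. A typical call site, mirroring the vp9_pickmode.c hunk later in this patch:

    int sad_list[5];
    vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
                          cond_sad_list(cpi, sad_list), /* NULL for SUBPEL_TREE */
                          &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);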
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 295e437..54b57cf 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -76,16 +76,6 @@
p->stats_in = position;
}
-static int lookup_next_frame_stats(const TWO_PASS *p,
- FIRSTPASS_STATS *next_frame) {
- if (p->stats_in >= p->stats_in_end)
- return EOF;
-
- *next_frame = *p->stats_in;
- return 1;
-}
-
-
// Read frame stats at an offset from the current position.
static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
@@ -256,7 +246,7 @@
}
void vp9_end_first_pass(VP9_COMP *cpi) {
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
int i;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
output_stats(&cpi->svc.layer_context[i].twopass.total_stats,
@@ -396,7 +386,7 @@
cpi->rc.frames_to_key = INT_MAX;
}
-void vp9_first_pass(VP9_COMP *cpi) {
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
@@ -428,10 +418,12 @@
int neutral_count = 0;
int new_mv_count = 0;
int sum_in_vectors = 0;
- uint32_t lastmv_as_int = 0;
+ MV lastmv = {0, 0};
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+ LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -444,42 +436,52 @@
set_first_pass_params(cpi);
vp9_set_quantizer(cm, find_fp_qindex());
- if (is_spatial_svc(cpi)) {
- MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
- const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL;
- twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass;
+ if (lc != NULL) {
+ twopass = &lc->twopass;
- if (cpi->common.current_video_frame == 0) {
- cpi->ref_frame_flags = 0;
+ cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+ if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+ REF_FRAMES) {
+ cpi->gld_fb_idx =
+ cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+ cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
} else {
- LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
- if (lc->current_video_frame_in_layer == 0)
- cpi->ref_frame_flags = VP9_GOLD_FLAG;
- else
- cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ cpi->refresh_golden_frame = 0;
}
+ if (lc->current_video_frame_in_layer == 0)
+ cpi->ref_frame_flags = 0;
+
vp9_scale_references(cpi);
// Use either last frame or alt frame for motion search.
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
- scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
- ref_frame = LAST_FRAME;
- } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- scaled_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
- ref_frame = GOLDEN_FRAME;
+ first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (first_ref_buf == NULL)
+ first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
}
- if (scaled_ref_buf != NULL)
- first_ref_buf = scaled_ref_buf;
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ const int ref_idx =
+ cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
+ const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
+
+ gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
+ get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ } else {
+ gld_yv12 = NULL;
+ }
recon_y_stride = new_yv12->y_stride;
recon_uv_stride = new_yv12->uv_stride;
uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
- // Disable golden frame for svc first pass for now.
- gld_yv12 = NULL;
- set_ref_ptrs(cm, xd, ref_frame, NONE);
+ set_ref_ptrs(cm, xd,
+ (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
&cpi->scaled_source);
@@ -511,9 +513,7 @@
vp9_tile_init(&tile, cm, 0, 0);
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
- int_mv best_ref_mv;
-
- best_ref_mv.as_int = 0;
+ MV best_ref_mv = {0, 0};
// Reset above block coeffs.
xd->up_available = (mb_row != 0);
@@ -591,16 +591,16 @@
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
// Other than for the first frame do a motion search.
- if (cm->current_video_frame > 0) {
+ if ((lc == NULL && cm->current_video_frame > 0) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 0)) {
int tmp_err, motion_error, raw_motion_error;
- int_mv mv, tmp_mv;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = {0, 0}, tmp_mv = {0, 0};
struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
- // Assume 0,0 motion with no mv overhead.
- mv.as_int = tmp_mv.as_int = 0;
// Compute the motion error of the 0,0 motion using the last source
// frame as the reference. Skip the further motion search on
@@ -613,11 +613,10 @@
&unscaled_last_source_buf_2d);
// TODO(pengchong): Replace the hard-coded threshold
- if (raw_motion_error > 25 || is_spatial_svc(cpi)) {
+ if (raw_motion_error > 25 || lc != NULL) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
- first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
- &motion_error);
+ first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
motion_error = (int)(motion_error * error_weight);
@@ -625,9 +624,9 @@
// If the current best reference mv is not centered on 0,0 then do a
// 0,0 based search as well.
- if (best_ref_mv.as_int) {
+ if (!is_zero_mv(&best_ref_mv)) {
tmp_err = INT_MAX;
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv, &tmp_err);
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
tmp_err = (int)(tmp_err * error_weight);
@@ -635,12 +634,14 @@
if (tmp_err < motion_error) {
motion_error = tmp_err;
- mv.as_int = tmp_mv.as_int;
+ mv = tmp_mv;
}
}
// Search in an older reference frame.
- if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+ if (((lc == NULL && cm->current_video_frame > 1) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 1))
+ && gld_yv12 != NULL) {
// Assume 0,0 motion with no mv overhead.
int gf_motion_error;
@@ -648,7 +649,7 @@
gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
&xd->plane[0].pre[0]);
- first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
&gf_motion_error);
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state();
@@ -679,7 +680,8 @@
}
// Start by assuming that intra mode is best.
- best_ref_mv.as_int = 0;
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -703,25 +705,25 @@
this_error < 2 * intrapenalty)
++neutral_count;
- mv.as_mv.row *= 8;
- mv.as_mv.col *= 8;
+ mv.row *= 8;
+ mv.col *= 8;
this_error = motion_error;
xd->mi[0]->mbmi.mode = NEWMV;
- xd->mi[0]->mbmi.mv[0] = mv;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
xd->mi[0]->mbmi.tx_size = TX_4X4;
xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
xd->mi[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
vp9_encode_sby_pass1(x, bsize);
- sum_mvr += mv.as_mv.row;
- sum_mvr_abs += abs(mv.as_mv.row);
- sum_mvc += mv.as_mv.col;
- sum_mvc_abs += abs(mv.as_mv.col);
- sum_mvrs += mv.as_mv.row * mv.as_mv.row;
- sum_mvcs += mv.as_mv.col * mv.as_mv.col;
+ sum_mvr += mv.row;
+ sum_mvr_abs += abs(mv.row);
+ sum_mvc += mv.col;
+ sum_mvc_abs += abs(mv.col);
+ sum_mvrs += mv.row * mv.row;
+ sum_mvcs += mv.col * mv.col;
++intercount;
- best_ref_mv.as_int = mv.as_int;
+ best_ref_mv = mv;
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
@@ -739,7 +741,7 @@
}
#endif
- if (mv.as_int) {
+ if (!is_zero_mv(&mv)) {
++mvcount;
#if CONFIG_FP_MB_STATS
@@ -770,33 +772,33 @@
#endif
// Non-zero vector, was it different from the last non zero vector?
- if (mv.as_int != lastmv_as_int)
+ if (!is_equal_mv(&mv, &lastmv))
++new_mv_count;
- lastmv_as_int = mv.as_int;
+ lastmv = mv;
// Does the row vector point inwards or outwards?
if (mb_row < cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
+ if (mv.row > 0)
--sum_in_vectors;
- else if (mv.as_mv.row < 0)
+ else if (mv.row < 0)
++sum_in_vectors;
} else if (mb_row > cm->mb_rows / 2) {
- if (mv.as_mv.row > 0)
+ if (mv.row > 0)
++sum_in_vectors;
- else if (mv.as_mv.row < 0)
+ else if (mv.row < 0)
--sum_in_vectors;
}
// Does the col vector point inwards or outwards?
if (mb_col < cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
+ if (mv.col > 0)
--sum_in_vectors;
- else if (mv.as_mv.col < 0)
+ else if (mv.col < 0)
++sum_in_vectors;
} else if (mb_col > cm->mb_cols / 2) {
- if (mv.as_mv.col > 0)
+ if (mv.col > 0)
++sum_in_vectors;
- else if (mv.as_mv.col < 0)
+ else if (mv.col < 0)
--sum_in_vectors;
}
}
@@ -864,7 +866,7 @@
// TODO(paulwilkins): Handle the case when duration is set to 0, or
// something less than the full time between subsequent values of
// cpi->source_time_stamp.
- fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start);
+ fps.duration = (double)(source->ts_end - source->ts_start);
// Don't want to do output stats with a stack variable!
twopass->this_frame_stats = fps;
@@ -895,7 +897,7 @@
vp9_extend_frame_borders(new_yv12);
- if (is_spatial_svc(cpi)) {
+ if (lc != NULL) {
vp9_update_reference_frames(cpi);
} else {
// Swap frame pointers so last frame refers to the frame we just compressed.
@@ -904,7 +906,7 @@
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
- if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
+ if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
}
@@ -926,7 +928,7 @@
++cm->current_video_frame;
if (cpi->use_svc)
- vp9_inc_frame_in_layer(&cpi->svc);
+ vp9_inc_frame_in_layer(cpi);
}
static double calc_correction_factor(double err_per_mb,
@@ -964,7 +966,7 @@
BPER_MB_NORMBITS) / num_mbs;
int q;
int is_svc_upper_layer = 0;
- if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0)
+ if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
// Try and pick a max Q that will be high enough to encode the
@@ -992,9 +994,9 @@
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- const int is_spatial_svc = (svc->number_spatial_layers > 1) &&
- (svc->number_temporal_layers == 1);
- TWO_PASS *const twopass = is_spatial_svc ?
+ const int is_two_pass_svc = (svc->number_spatial_layers > 1) ||
+ (svc->number_temporal_layers > 1);
+ TWO_PASS *const twopass = is_two_pass_svc ?
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
double frame_rate;
FIRSTPASS_STATS *stats;
@@ -1017,7 +1019,7 @@
// It is calculated based on the actual durations of all frames from the
// first pass.
- if (is_spatial_svc) {
+ if (is_two_pass_svc) {
vp9_update_spatial_layer_framerate(cpi, frame_rate);
twopass->bits_left = (int64_t)(stats->duration *
svc->layer_context[svc->spatial_layer_id].target_bandwidth /
@@ -1032,7 +1034,7 @@
// scores used in the second pass. We have this minimum to make sure
// that clips that are static but "low complexity" in the intra domain
// are still boosted appropriately for KF/GF/ARF.
- if (!is_spatial_svc) {
+ if (!is_two_pass_svc) {
// We don't know the number of MBs for each layer at this point.
// So we will do it later.
twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
@@ -1081,8 +1083,7 @@
// This function gives an estimate of how badly we believe the prediction
// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
- const FIRSTPASS_STATS *frame) {
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
const double sr_ratio = frame->coded_error /
DOUBLE_DIVIDE_CHECK(frame->sr_coded_error);
const double zero_motion_pct = frame->pcnt_inter -
@@ -1095,12 +1096,10 @@
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *twopass,
+static int detect_transition_to_still(const TWO_PASS *twopass,
int frame_interval, int still_interval,
double loop_decay_rate,
double last_decay_rate) {
- int trans_to_still = 0;
-
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
@@ -1108,26 +1107,22 @@
loop_decay_rate >= 0.999 &&
last_decay_rate < 0.9) {
int j;
- const FIRSTPASS_STATS *position = twopass->stats_in;
- FIRSTPASS_STATS tmp_next_frame;
// Look ahead a few frames to see if static condition persists...
for (j = 0; j < still_interval; ++j) {
- if (EOF == input_stats(twopass, &tmp_next_frame))
+ const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+ if (stats >= twopass->stats_in_end)
break;
- if (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion < 0.999)
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999)
break;
}
- reset_fpf_position(twopass, position);
-
// Only if it does do we signal a transition to still.
- if (j == still_interval)
- trans_to_still = 1;
+ return j == still_interval;
}
- return trans_to_still;
+ return 0;
}
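The rewrite replaces the consume-and-rewind idiom (input_stats() followed by reset_fpf_position()) with a bounded read-ahead: the first-pass stats live in one contiguous array, stats_in is the read cursor, and stats_in_end is the one-past-the-end sentinel, so &twopass->stats_in[j] peeks j frames ahead without moving the cursor. Restating the loop above with comments:

    for (j = 0; j < still_interval; ++j) {
      /* Peek without consuming: plain pointer arithmetic on the array. */
      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
      if (stats >= twopass->stats_in_end)                /* past the last packet */
        break;
      if (stats->pcnt_inter - stats->pcnt_motion < 0.999)  /* motion resumed */
        break;
    }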
// This function detects a flash through the high relative pcnt_second_ref
@@ -1373,7 +1368,8 @@
double group_error, int gf_arf_bits) {
RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- TWO_PASS *twopass = &cpi->twopass;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
FIRSTPASS_STATS frame_stats;
int i;
int frame_index = 1;
@@ -1386,6 +1382,13 @@
int mid_boost_bits = 0;
int mid_frame_idx;
unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+ int alt_frame_index = frame_index;
+ int has_temporal_layers = is_two_pass_svc(cpi) &&
+ cpi->svc.number_temporal_layers > 1;
+
+ // Only encode the alt reference frame in the temporal base layer.
+ if (has_temporal_layers)
+ alt_frame_index = cpi->svc.number_temporal_layers;
key_frame = cpi->common.frame_type == KEY_FRAME ||
vp9_is_upper_layer_key_frame(cpi);
@@ -1396,17 +1399,17 @@
// is also the golden frame.
if (!key_frame) {
if (rc->source_alt_ref_active) {
- twopass->gf_group.update_type[0] = OVERLAY_UPDATE;
- twopass->gf_group.rf_level[0] = INTER_NORMAL;
- twopass->gf_group.bit_allocation[0] = 0;
- twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
- twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+ gf_group->update_type[0] = OVERLAY_UPDATE;
+ gf_group->rf_level[0] = INTER_NORMAL;
+ gf_group->bit_allocation[0] = 0;
+ gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
} else {
- twopass->gf_group.update_type[0] = GF_UPDATE;
- twopass->gf_group.rf_level[0] = GF_ARF_STD;
- twopass->gf_group.bit_allocation[0] = gf_arf_bits;
- twopass->gf_group.arf_update_idx[0] = arf_buffer_indices[0];
- twopass->gf_group.arf_ref_idx[0] = arf_buffer_indices[0];
+ gf_group->update_type[0] = GF_UPDATE;
+ gf_group->rf_level[0] = GF_ARF_STD;
+ gf_group->bit_allocation[0] = gf_arf_bits;
+ gf_group->arf_update_idx[0] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
}
// Step over the golden frame / overlay frame
@@ -1421,25 +1424,33 @@
// Store the bits to spend on the ARF if there is one.
if (rc->source_alt_ref_pending) {
- twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
- twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
- twopass->gf_group.bit_allocation[frame_index] = gf_arf_bits;
- twopass->gf_group.arf_src_offset[frame_index] =
- (unsigned char)(rc->baseline_gf_interval - 1);
- twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
- twopass->gf_group.arf_ref_idx[frame_index] =
+ gf_group->update_type[alt_frame_index] = ARF_UPDATE;
+ gf_group->rf_level[alt_frame_index] = GF_ARF_STD;
+ gf_group->bit_allocation[alt_frame_index] = gf_arf_bits;
+
+ if (has_temporal_layers)
+ gf_group->arf_src_offset[alt_frame_index] =
+ (unsigned char)(rc->baseline_gf_interval -
+ cpi->svc.number_temporal_layers);
+ else
+ gf_group->arf_src_offset[alt_frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+ gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[alt_frame_index] =
arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
rc->source_alt_ref_active];
- ++frame_index;
+ if (!has_temporal_layers)
+ ++frame_index;
if (cpi->multi_arf_enabled) {
// Set aside a slot for a level 1 arf.
- twopass->gf_group.update_type[frame_index] = ARF_UPDATE;
- twopass->gf_group.rf_level[frame_index] = GF_ARF_LOW;
- twopass->gf_group.arf_src_offset[frame_index] =
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] =
(unsigned char)((rc->baseline_gf_interval >> 1) - 1);
- twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[1];
- twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
++frame_index;
}
}
@@ -1453,6 +1464,10 @@
if (EOF == input_stats(twopass, &frame_stats))
break;
+ if (has_temporal_layers && frame_index == alt_frame_index) {
+ ++frame_index;
+ }
+
modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
if (group_error > 0)
@@ -1469,16 +1484,16 @@
if (frame_index <= mid_frame_idx)
arf_idx = 1;
}
- twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
- twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
target_frame_size = clamp(target_frame_size, 0,
MIN(max_bits, (int)total_group_bits));
- twopass->gf_group.update_type[frame_index] = LF_UPDATE;
- twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
- twopass->gf_group.bit_allocation[frame_index] = target_frame_size;
+ gf_group->bit_allocation[frame_index] = target_frame_size;
++frame_index;
}
@@ -1486,23 +1501,23 @@
// We need to configure the frame at the end of the sequence + 1 that will be
// the start frame for the next group. Otherwise prior to the call to
// vp9_rc_get_second_pass_params() the data will be undefined.
- twopass->gf_group.arf_update_idx[frame_index] = arf_buffer_indices[0];
- twopass->gf_group.arf_ref_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+ gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
if (rc->source_alt_ref_pending) {
- twopass->gf_group.update_type[frame_index] = OVERLAY_UPDATE;
- twopass->gf_group.rf_level[frame_index] = INTER_NORMAL;
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
// Final setup for second arf and its overlay.
if (cpi->multi_arf_enabled) {
- twopass->gf_group.bit_allocation[2] =
- twopass->gf_group.bit_allocation[mid_frame_idx] + mid_boost_bits;
- twopass->gf_group.update_type[mid_frame_idx] = OVERLAY_UPDATE;
- twopass->gf_group.bit_allocation[mid_frame_idx] = 0;
+ gf_group->bit_allocation[2] =
+ gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits;
+ gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
+ gf_group->bit_allocation[mid_frame_idx] = 0;
}
} else {
- twopass->gf_group.update_type[frame_index] = GF_UPDATE;
- twopass->gf_group.rf_level[frame_index] = GF_ARF_STD;
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
}
// Note whether multi-arf was enabled this group for next time.
@@ -1554,8 +1569,6 @@
vp9_clear_system_state();
vp9_zero(next_frame);
- gf_group_bits = 0;
-
// Load stats for the current frame.
mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
@@ -1615,9 +1628,8 @@
decay_accumulator = decay_accumulator * loop_decay_rate;
// Monitor for static sections.
- zero_motion_accumulator =
- MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ zero_motion_accumulator = MIN(zero_motion_accumulator,
+ get_zero_motion_factor(&next_frame));
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
@@ -1677,6 +1689,21 @@
else
rc->baseline_gf_interval = i;
+ // Only encode the alt reference frame in the temporal base layer, so
+ // baseline_gf_interval should be a multiple of a temporal layer group
+ // (typically the frame distance between two base layer frames).
+ if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+ int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+ int new_gf_interval = (rc->baseline_gf_interval + count) & (~count);
+ int j;
+ for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
+ if (EOF == input_stats(twopass, this_frame))
+ break;
+ gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+ }
+ rc->baseline_gf_interval = new_gf_interval;
+ }
+
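The alignment trick works because count = 2^(layers - 1) - 1 is an all-ones mask: adding it and then clearing the low bits rounds baseline_gf_interval up to the next multiple of the temporal-layer group length. With 3 temporal layers, count = 3, so an interval of 10 becomes (10 + 3) & ~3 = 12; the loop then folds the extra frames' modified error into the group error so the bit budget still covers them. The same rounding is applied to frames_to_key further down. A sketch of the math, with hypothetical variable names:

    /* Round x up to a multiple of 2^(layers - 1).                       */
    int count = (1 << (number_temporal_layers - 1)) - 1; /* 3 layers -> 3 */
    int aligned = (x + count) & ~count;                  /* 10 -> 12      */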
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// Should we use the alternate reference frame.
@@ -1831,6 +1858,7 @@
int i, j;
RATE_CONTROL *const rc = &cpi->rc;
TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
const FIRSTPASS_STATS first_frame = *this_frame;
const FIRSTPASS_STATS *const start_position = twopass->stats_in;
@@ -1849,7 +1877,7 @@
cpi->common.frame_type = KEY_FRAME;
// Reset the GF group data structures.
- vp9_zero(twopass->gf_group);
+ vp9_zero(*gf_group);
// Is this a forced key frame by interval.
rc->this_key_frame_forced = rc->next_key_frame_forced;
@@ -1881,16 +1909,17 @@
input_stats(twopass, this_frame);
// Provided that we are not at the end of the file...
- if (cpi->oxcf.auto_key &&
- lookup_next_frame_stats(twopass, &next_frame) != EOF) {
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
double loop_decay_rate;
// Check for a scene cut.
- if (test_candidate_kf(twopass, &last_frame, this_frame, &next_frame))
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
break;
// How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(&cpi->common,
+ twopass->stats_in);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
@@ -1947,6 +1976,18 @@
rc->next_key_frame_forced = 0;
}
+ if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) {
+ int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1;
+ int new_frame_to_key = (rc->frames_to_key + count) & (~count);
+ int j;
+ for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
+ if (EOF == input_stats(twopass, this_frame))
+ break;
+ kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+ }
+ rc->frames_to_key = new_frame_to_key;
+ }
+
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
@@ -1987,9 +2028,8 @@
break;
// Monitor for static sections.
- zero_motion_accumulator =
- MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ zero_motion_accumulator = MIN(zero_motion_accumulator,
+ get_zero_motion_factor(&next_frame));
// For the first few frames collect data to decide kf boost.
if (i <= (rc->max_gf_interval * 2)) {
@@ -2040,9 +2080,9 @@
twopass->kf_group_bits -= kf_bits;
// Save the bits to spend on the key frame.
- twopass->gf_group.bit_allocation[0] = kf_bits;
- twopass->gf_group.update_type[0] = KF_UPDATE;
- twopass->gf_group.rf_level[0] = KF_STD;
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
// Note the total error score of the kf group minus the key frame itself.
twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
@@ -2106,7 +2146,7 @@
assert(0);
break;
}
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0)
cpi->refresh_golden_frame = 0;
if (cpi->alt_ref_source == NULL)
@@ -2119,15 +2159,16 @@
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
int frames_left;
FIRSTPASS_STATS this_frame;
FIRSTPASS_STATS this_frame_copy;
int target_rate;
- LAYER_CONTEXT *lc = NULL;
+ LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
- if (is_spatial_svc(cpi)) {
- lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+ if (lc != NULL) {
frames_left = (int)(twopass->total_stats.count -
lc->current_video_frame_in_layer);
} else {
@@ -2140,10 +2181,10 @@
// If this is an arf frame then we dont want to read the stats file or
// advance the input pointer as we already have what we need.
- if (twopass->gf_group.update_type[twopass->gf_group.index] == ARF_UPDATE) {
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
int target_rate;
configure_buffer_updates(cpi);
- target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+ target_rate = gf_group->bit_allocation[gf_group->index];
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
@@ -2154,7 +2195,7 @@
vp9_rc_set_frame_target(cpi, target_rate);
cm->frame_type = INTER_FRAME;
- if (is_spatial_svc(cpi)) {
+ if (lc != NULL) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
} else {
@@ -2170,7 +2211,7 @@
vp9_clear_system_state();
- if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) {
+ if (lc != NULL && twopass->kf_intra_err_min == 0) {
twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
}
@@ -2178,8 +2219,7 @@
if (cpi->oxcf.rc_mode == VPX_Q) {
twopass->active_worst_quality = cpi->oxcf.cq_level;
} else if (cm->current_video_frame == 0 ||
- (is_spatial_svc(cpi) &&
- lc->current_video_frame_in_layer == 0)) {
+ (lc != NULL && lc->current_video_frame_in_layer == 0)) {
// Special case code for first frame.
const int section_target_bandwidth = (int)(twopass->bits_left /
frames_left);
@@ -2205,18 +2245,21 @@
cm->frame_type = INTER_FRAME;
}
- if (is_spatial_svc(cpi)) {
+ if (lc != NULL) {
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = (cm->frame_type == KEY_FRAME);
- if (lc->is_key_frame)
+ if (lc->is_key_frame) {
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+ lc->frames_from_key_frame = 0;
+ }
} else {
cm->frame_type = INTER_FRAME;
lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame;
if (lc->is_key_frame) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+ lc->frames_from_key_frame = 0;
}
}
}
@@ -2236,13 +2279,13 @@
}
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
- if (!is_spatial_svc(cpi))
+ if (lc != NULL)
cpi->refresh_golden_frame = 1;
}
configure_buffer_updates(cpi);
- target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index];
+ target_rate = gf_group->bit_allocation[gf_group->index];
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
else
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index bf8c9fd..aaa6b03 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -121,7 +121,7 @@
void vp9_init_first_pass(struct VP9_COMP *cpi);
void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
-void vp9_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
void vp9_end_first_pass(struct VP9_COMP *cpi);
void vp9_init_second_pass(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index e743517..823e7a1 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -50,6 +50,9 @@
unsigned int height,
unsigned int subsampling_x,
unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
unsigned int depth) {
struct lookahead_ctx *ctx = NULL;
@@ -70,6 +73,9 @@
for (i = 0; i < depth; i++)
if (vp9_alloc_frame_buffer(&ctx->buf[i].img,
width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS))
goto bail;
}
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 678c51a..2786193 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -56,6 +56,9 @@
unsigned int height,
unsigned int subsampling_x,
unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
unsigned int depth);
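With CONFIG_VP9_HIGHBITDEPTH enabled, the lookahead allocator grows an extra flag, so every caller has to thread it through under the same preprocessor condition. A sketch of a call site for vp9_lookahead_init, assuming the flag lives on the common struct as in the vp9_encoder.c hunks above (the actual caller is not shown in this patch):

    cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
                                        cm->subsampling_x, cm->subsampling_y,
    #if CONFIG_VP9_HIGHBITDEPTH
                                        cm->use_highbitdepth,
    #endif
                                        oxcf->lag_in_frames);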
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 6e04e2a..b8e7164 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -34,6 +34,7 @@
const int tmp_row_min = x->mv_row_min;
const int tmp_row_max = x->mv_row_max;
MV ref_full;
+ int sad_list[5];
// Further step/diamond searches as necessary
int step_param = mv_sf->reduce_first_step_size;
@@ -45,8 +46,9 @@
ref_full.row = ref_mv->row >> 3;
/*cpi->sf.search_method == HEX*/
- vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, &v_fn_ptr, 0,
- ref_mv, dst_mv);
+ vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+ cond_sad_list(cpi, sad_list),
+ &v_fn_ptr, 0, ref_mv, dst_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -55,8 +57,10 @@
unsigned int sse;
cpi->find_fractional_mv_step(
x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
- &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, NULL, NULL, &distortion,
- &sse, NULL, 0, 0);
+ &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
+ NULL, NULL,
+ &distortion, &sse, NULL, 0, 0);
}
xd->mi[0]->mbmi.mode = NEWMV;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index ae924d5..d6f6b25 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -256,6 +256,137 @@
} \
}
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const z = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int rr = ref_mv->row; \
+ int rc = ref_mv->col; \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \
+ int tr = br; \
+ int tc = bc; \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8; \
+ if (second_pred != NULL) { \
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
+ } else { \
+ besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
+ } \
+ *distortion = besterr; \
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
+ MV *bestmv, const MV *ref_mv,
+ int allow_hp,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop,
+ int iters_per_step,
+ int *sad_list,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h) {
+ SETUP_SUBPEL_SEARCH;
+
+ if (sad_list &&
+ sad_list[0] != INT_MAX && sad_list[1] != INT_MAX &&
+ sad_list[2] != INT_MAX && sad_list[3] != INT_MAX &&
+ sad_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) +
+ (sad_list[2] < sad_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each subsequent iteration checks at least one point in common with the
+ // last iteration, and can check two if the diagonal was selected.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void) tr;
+ (void) tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
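When all five SADs from the full-pixel stage are valid, the pruned search above collapses the usual first-level sweep to a single quadrant: whichdir packs two comparisons into a 2-bit index (bit 0: left vs. right, bit 1: up vs. down), and only the three half-pel points in the winning quadrant are checked. A worked example under assumed SAD values:

    /* sad_list = { best, left, up, right, down } = { 80, 120, 90, 150, 200 } */
    /* bit 0: 120 < 150 -> 0 (left side wins)                                 */
    /* bit 1:  90 < 200 -> 0 (upper side wins)                                */
    /* whichdir = 0 -> case 0: check left, up and up-left half-pel points.    */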
int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
int allow_hp,
@@ -263,55 +394,14 @@
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
+ int *sad_list,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1,
const uint8_t *second_pred,
int w, int h) {
- const uint8_t *const z = x->plane[0].src.buf;
- const int src_stride = x->plane[0].src.stride;
- const MACROBLOCKD *xd = &x->e_mbd;
- unsigned int besterr = INT_MAX;
- unsigned int sse;
- unsigned int whichdir;
- int thismse;
- const unsigned int halfiters = iters_per_step;
- const unsigned int quarteriters = iters_per_step;
- const unsigned int eighthiters = iters_per_step;
-
- const int y_stride = xd->plane[0].pre[0].stride;
- const int offset = bestmv->row * y_stride + bestmv->col;
- const uint8_t *const y = xd->plane[0].pre[0].buf;
-
- int rr = ref_mv->row;
- int rc = ref_mv->col;
- int br = bestmv->row * 8;
- int bc = bestmv->col * 8;
- int hstep = 4;
- const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
- const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
- const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
- const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
-
- int tr = br;
- int tc = bc;
-
- // central mv
- bestmv->row *= 8;
- bestmv->col *= 8;
-
- // calculate central point error
- // TODO(yunqingwang): central pointer error was already calculated in full-
- // pixel search, and can be passed in this function.
- if (second_pred != NULL) {
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
- } else {
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1);
- }
- *distortion = besterr;
- besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ SETUP_SUBPEL_SEARCH;
+ (void) sad_list; // to silence compiler warning
// Each subsequent iteration checks at least one point in common with the
// last iteration, and can check two if the diagonal was selected.
@@ -398,14 +488,17 @@
// Each scale can have a different number of candidates and shape of
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
static int vp9_pattern_search(const MACROBLOCK *x,
MV *ref_mv,
int search_param,
int sad_per_bit,
- int do_init_search, int do_refine,
+ int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
- const MV *center_mv, MV *best_mv,
+ const MV *center_mv,
+ MV *best_mv,
const int num_candidates[MAX_PATTERN_SCALES],
const MV candidates[MAX_PATTERN_SCALES]
[MAX_PATTERN_CANDIDATES]) {
@@ -413,7 +506,7 @@
static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
};
- int i, j, s, t;
+ int i, s, t;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
int br, bc;
@@ -552,47 +645,38 @@
} while (s--);
}
- // Check 4 1-away neighbors if do_refine is true.
- // For most well-designed schemes do_refine will not be necessary.
- if (do_refine) {
- static const MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
-
- for (j = 0; j < 16; j++) {
- int best_site = -1;
- if (check_bounds(x, br, bc, 1)) {
- for (i = 0; i < 4; i++) {
- const MV this_mv = {br + neighbors[i].row,
- bc + neighbors[i].col};
- thissad = vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < 4; i++) {
- const MV this_mv = {br + neighbors[i].row,
- bc + neighbors[i].col};
- if (!is_mv_in(x, &this_mv))
- continue;
- thissad = vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
- CHECK_BETTER
- }
+ // Returns the one-away integer pel sad values around the best as follows:
+ // sad_list[0]: sad at the best integer pel
+ // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel
+ // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel
+ // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel
+ // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel
+ if (sad_list) {
+ static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+ sad_list[0] = bestsad;
+ if (check_bounds(x, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
}
-
- if (best_site == -1) {
- break;
- } else {
- br += neighbors[best_site].row;
- bc += neighbors[best_site].col;
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ if (!is_mv_in(x, &this_mv))
+ sad_list[i + 1] = INT_MAX;
+ else
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
}
}
}
-
best_mv->row = br;
best_mv->col = bc;
-
return bestsad;
}
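Instead of the removed do_refine pass, the pattern search now publishes its endgame: the winner's SAD plus the four one-away neighbors, with INT_MAX standing in for any neighbor outside the frame. The pruned subpel search only trusts the list when all five entries are valid (see the guard earlier in this patch), so the sentinel doubles as an opt-out. A producer-side sketch, mirroring the vp9_mbgraph.c hunk above:

    int sad_list[5];
    vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
                   cond_sad_list(cpi, sad_list),   /* may be NULL */
                   &v_fn_ptr, 0, ref_mv, dst_mv);
    /* sad_list[0]: SAD at the winner; [1..4]: left/top/right/bottom, */
    /* INT_MAX where a neighbor falls outside the frame.              */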
@@ -634,6 +718,7 @@
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv, MV *best_mv) {
@@ -658,7 +743,7 @@
{ -1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
hex_num_candidates, hex_candidates);
}
@@ -668,6 +753,7 @@
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
@@ -699,7 +785,7 @@
{-512, 512}, {-1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
bigdia_num_candidates, bigdia_candidates);
}
@@ -709,6 +795,7 @@
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
@@ -740,7 +827,7 @@
{0, 1024}, {-1024, 1024}, {-1024, 0}},
};
return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, 0, vfp, use_mvcost,
+ do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv,
square_num_candidates, square_candidates);
}
@@ -750,12 +837,13 @@
int search_param,
int sad_per_bit,
int do_init_search, // must be zero for fast_hex
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
- sad_per_bit, do_init_search, vfp, use_mvcost,
+ sad_per_bit, do_init_search, sad_list, vfp, use_mvcost,
center_mv, best_mv);
}
@@ -764,13 +852,14 @@
int search_param,
int sad_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vfp,
int use_mvcost,
const MV *center_mv,
MV *best_mv) {
return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
- sad_per_bit, do_init_search, vfp, use_mvcost,
- center_mv, best_mv);
+ sad_per_bit, do_init_search, sad_list, vfp,
+ use_mvcost, center_mv, best_mv);
}
#undef CHECK_BETTER
@@ -1368,33 +1457,41 @@
int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full,
int step_param, int error_per_bit,
+ int *sad_list,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd) {
const SPEED_FEATURES *const sf = &cpi->sf;
const SEARCH_METHODS method = sf->mv.search_method;
vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
+ if (sad_list) {
+ sad_list[0] = INT_MAX;
+ sad_list[1] = INT_MAX;
+ sad_list[2] = INT_MAX;
+ sad_list[3] = INT_MAX;
+ sad_list[4] = INT_MAX;
+ }
switch (method) {
case FAST_DIAMOND:
var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case FAST_HEX:
var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case HEX:
var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case SQUARE:
var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case BIGDIA:
var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
- fn_ptr, 1, ref_mv, tmp_mv);
+ sad_list, fn_ptr, 1, ref_mv, tmp_mv);
break;
case NSTEP:
var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 298fbb6..9b4734a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,6 +79,7 @@
int search_param,
int error_per_bit,
int do_init_search,
+ int *sad_list,
const vp9_variance_fn_ptr_t *vf,
int use_mvcost,
const MV *center_mv,
@@ -98,12 +99,14 @@
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step,
+ int *sad_list,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
int w, int h);
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x,
const MV *ref_mv, int sad_per_bit,
@@ -136,8 +139,10 @@
int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full,
int step_param, int error_per_bit,
+ int *sad_list,
const MV *ref_mv, MV *tmp_mv,
int var_max, int rd);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index d365489..5557d7f 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -77,7 +77,6 @@
while (filter_step > 0) {
const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
- int filt_err;
// Bias against raising loop filter in favor of lowering it.
int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -92,17 +91,14 @@
if (filt_direction <= 0 && filt_low != filt_mid) {
// Get Low filter error score
if (ss_err[filt_low] < 0) {
- filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
- ss_err[filt_low] = filt_err;
- } else {
- filt_err = ss_err[filt_low];
+ ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
}
// If value is close to the best so far then bias towards a lower loop
// filter value.
- if ((filt_err - bias) < best_err) {
+ if ((ss_err[filt_low] - bias) < best_err) {
// Was it actually better than the previous best?
- if (filt_err < best_err)
- best_err = filt_err;
+ if (ss_err[filt_low] < best_err)
+ best_err = ss_err[filt_low];
filt_best = filt_low;
}
@@ -111,14 +107,11 @@
// Now look at filt_high
if (filt_direction >= 0 && filt_high != filt_mid) {
if (ss_err[filt_high] < 0) {
- filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
- ss_err[filt_high] = filt_err;
- } else {
- filt_err = ss_err[filt_high];
+ ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
}
// Was it better than the previous best?
- if (filt_err < (best_err - bias)) {
- best_err = filt_err;
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
filt_best = filt_high;
}
}
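
The vp9_picklpf.c change above drops the filt_err temporary and treats ss_err[] itself as a memo table, so each filter level is measured at most once. A self-contained sketch of that memoised bisection, with a toy error curve standing in for try_filter_frame():

#include <stdio.h>

#define MAX_LOOP_FILTER 63

/* Stand-in for try_filter_frame(): convex, minimum near level 20. */
static long long try_level(int level) {
  const long long d = level - 20;
  return d * d + 100;
}

int main(void) {
  long long ss_err[MAX_LOOP_FILTER + 1];
  long long best_err;
  int i, filt_mid = 32, filter_step = 16, filt_best = 32;
  for (i = 0; i <= MAX_LOOP_FILTER; ++i)
    ss_err[i] = -1;                     /* -1 means "not tried yet" */
  ss_err[filt_mid] = try_level(filt_mid);
  best_err = ss_err[filt_mid];
  while (filter_step > 0) {
    const int lo = filt_mid - filter_step < 0 ? 0 : filt_mid - filter_step;
    const int hi = filt_mid + filter_step > MAX_LOOP_FILTER
                       ? MAX_LOOP_FILTER : filt_mid + filter_step;
    if (ss_err[lo] < 0) ss_err[lo] = try_level(lo);  /* fill memo */
    if (ss_err[hi] < 0) ss_err[hi] = try_level(hi);
    if (ss_err[lo] < best_err) { best_err = ss_err[lo]; filt_best = lo; }
    if (ss_err[hi] < best_err) { best_err = ss_err[hi]; filt_best = hi; }
    if (filt_best == filt_mid)
      filter_step /= 2;                 /* converged; refine */
    else
      filt_mid = filt_best;             /* re-centre on the winner */
  }
  printf("best level %d, err %lld\n", filt_best, best_err);
  return 0;
}
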
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 6115f5a..a97d778 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -17,6 +17,7 @@
#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_reconinter.h"
@@ -27,11 +28,17 @@
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
+typedef struct {
+ uint8_t *data;
+ int stride;
+ int in_use;
+} PRED_BUFFER;
+
static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
- const TileInfo *const tile,
- MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
- int_mv *mv_ref_list,
- int mi_row, int mi_col) {
+ const TileInfo *const tile,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list,
+ int mi_row, int mi_col) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
@@ -125,6 +132,7 @@
const int tmp_row_min = x->mv_row_min;
const int tmp_row_max = x->mv_row_max;
int rv = 0;
+ int sad_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
if (cpi->common.show_frame &&
@@ -151,8 +159,9 @@
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv,
- &tmp_mv->as_mv, INT_MAX, 0);
+ vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cond_sad_list(cpi, sad_list),
+ &ref_mv, &tmp_mv->as_mv, INT_MAX, 0);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -178,6 +187,7 @@
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
x->pred_mv[ref] = tmp_mv->as_mv;
@@ -220,13 +230,15 @@
if (cpi->common.tx_mode == TX_MODE_SELECT) {
if (sse > (var << 2))
- xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ xd->mi[0]->mbmi.tx_size =
+ MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
else
xd->mi[0]->mbmi.tx_size = TX_8X8;
} else {
- xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ xd->mi[0]->mbmi.tx_size =
+ MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
}
vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
@@ -253,7 +265,8 @@
}
static void free_pred_buffer(PRED_BUFFER *p) {
- p->in_use = 0;
+ if (p != NULL)
+ p->in_use = 0;
}
static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
@@ -326,7 +339,7 @@
// The cost of skip bit needs to be added.
*rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
- [INTER_OFFSET(this_mode)];
+ [INTER_OFFSET(this_mode)];
// More on this part of rate
// rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
@@ -343,7 +356,53 @@
}
}
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+struct estimate_block_intra_args {
+ VP9_COMP *cpi;
+ MACROBLOCK *x;
+ PREDICTION_MODE mode;
+ int rate;
+ int64_t dist;
+};
+
+static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct estimate_block_intra_args* const args = arg;
+ VP9_COMP *const cpi = args->cpi;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ int i, j;
+ int rate;
+ int64_t dist;
+ unsigned int var_y, sse_y;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ assert(plane == 0);
+ (void) plane;
+
+ p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
+ pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
+ // Use source buffer as an approximation for the fully reconstructed buffer.
+ vp9_predict_intra_block(xd, block >> (2 * tx_size),
+ b_width_log2(plane_bsize),
+ tx_size, args->mode,
+ p->src.buf, src_stride,
+ pd->dst.buf, dst_stride,
+ i, j, 0);
+ // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
+ model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+ args->rate += rate;
+ args->dist += dist;
+}
+
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
{THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
{THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
{THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
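
estimate_block_intra() above is written as a visitor: vp9_foreach_transformed_block_in_plane() walks the transform blocks and invokes the callback with an accumulator threaded through the void *arg. A toy version of that pattern, with an illustrative iterator and cost model in place of the libvpx helpers:

#include <stdint.h>
#include <stdio.h>

struct intra_args {
  int rate;
  int64_t dist;
};

typedef void (*visit_fn)(int plane, int block, void *arg);

/* Stand-in for vp9_foreach_transformed_block_in_plane(). */
static void foreach_block(int plane, int n_blocks, visit_fn visit,
                          void *arg) {
  int b;
  for (b = 0; b < n_blocks; ++b)
    visit(plane, b, arg);
}

static void estimate_block(int plane, int block, void *arg) {
  struct intra_args *const args = arg;
  (void)plane;
  /* In the real callback these come from model_rd_for_sb_y(). */
  args->rate += 10 + block;
  args->dist += 100;
}

int main(void) {
  struct intra_args args = { 0, 0 };
  foreach_block(0, 4, estimate_block, &args);
  printf("rate=%d dist=%lld\n", args.rate, (long long)args.dist);
  return 0;
}
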
@@ -351,21 +410,21 @@
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// This needs various further optimizations; to be continued.
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
- BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct macroblock_plane *const p = &x->plane[0];
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
struct macroblockd_plane *const pd = &xd->plane[0];
- PREDICTION_MODE this_mode, best_mode = ZEROMV;
+ PREDICTION_MODE best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ tx_mode_to_biggest_tx_size[cm->tx_mode]);
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
@@ -373,33 +432,30 @@
VP9_ALT_FLAG };
int64_t best_rd = INT64_MAX;
int64_t this_rd = INT64_MAX;
- int skip_txfm = 0;
+ uint8_t skip_txfm = 0;
int rate = INT_MAX;
int64_t dist = INT64_MAX;
// var_y and sse_y are saved to be used in skipping checking
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
- VP9_COMMON *cm = &cpi->common;
- int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
-
+ const int intra_cost_penalty =
+ 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
- const int64_t intra_mode_cost = 50;
+ const int intra_mode_cost = 50;
- unsigned char segment_id = mbmi->segment_id;
+ const int8_t segment_id = mbmi->segment_id;
const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
- // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame.
INTERP_FILTER filter_ref = cm->interp_filter;
- int bsl = mi_width_log2(bsize);
+ const int bsl = mi_width_log2(bsize);
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
(((mi_row + mi_col) >> bsl) +
get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
int const_motion[MAX_REF_FRAMES] = { 0 };
- int bh = num_4x4_blocks_high_lookup[bsize] << 2;
- int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
- int pixels_in_block = bh * bw;
+ const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+ const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
// For speed 6, the result of the interp filter search is reused later in the
// actual encoding process.
// tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
@@ -408,15 +464,11 @@
struct buf_2d orig_dst = pd->dst;
PRED_BUFFER *best_pred = NULL;
PRED_BUFFER *this_mode_pred = NULL;
- int i;
- // CTX is used by the temporal denoiser which is currently being developed.
- // TODO(jbb): when temporal denoiser is finished and in the default build
- // remove the following line;
- (void) ctx;
if (cpi->sf.reuse_inter_pred_sby) {
+ int i;
for (i = 0; i < 3; i++) {
- tmp[i].data = &pred_buf[pixels_in_block * i];
+ tmp[i].data = &pred_buf[bw * bh * i];
tmp[i].stride = bw;
tmp[i].in_use = 0;
}
@@ -442,6 +494,7 @@
mbmi->segment_id = segment_id;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ PREDICTION_MODE this_mode;
x->pred_mv_sad[ref_frame] = INT_MAX;
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -496,8 +549,9 @@
if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
continue;
- mode_rd_thresh = rd_threshes[mode_idx[ref_frame - LAST_FRAME]
- [this_mode - NEARESTMV]];
+ mode_rd_thresh =
+ rd_threshes[mode_idx[ref_frame -
+ LAST_FRAME][INTER_OFFSET(this_mode)]];
if (rd_less_than_thresh(best_rd, mode_rd_thresh,
rd_thresh_freq_fact[this_mode]))
continue;
@@ -514,7 +568,7 @@
if (this_mode != NEARESTMV &&
frame_mv[this_mode][ref_frame].as_int ==
frame_mv[NEARESTMV][ref_frame].as_int)
- continue;
+ continue;
mbmi->mode = this_mode;
mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
@@ -593,7 +647,7 @@
rate += rate_mv;
rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
- [INTER_OFFSET(this_mode)];
+ [INTER_OFFSET(this_mode)];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
// Skipping checking: test to see if this block can be reconstructed by
@@ -609,9 +663,10 @@
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y,
- this_mode, ctx);
+ vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
}
+#else
+ (void)ctx;
#endif
if (this_rd < best_rd || x->skip) {
@@ -625,8 +680,7 @@
skip_txfm = x->skip_txfm[0];
if (cpi->sf.reuse_inter_pred_sby) {
- if (best_pred != NULL)
- free_pred_buffer(best_pred);
+ free_pred_buffer(best_pred);
best_pred = this_mode_pred;
}
@@ -646,16 +700,11 @@
// If best prediction is not in dst buf, then copy the prediction block from
// temp buf to dst buf.
- if (cpi->sf.reuse_inter_pred_sby && best_pred->data != orig_dst.buf) {
- uint8_t *copy_from, *copy_to;
-
+ if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
+ best_pred->data != orig_dst.buf) {
pd->dst = orig_dst;
- copy_to = pd->dst.buf;
-
- copy_from = best_pred->data;
-
- vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0,
- bw, bh);
+ vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh);
}
mbmi->mode = best_mode;
@@ -670,20 +719,11 @@
// threshold.
if (!x->skip && best_rd > inter_mode_thresh &&
bsize <= cpi->sf.max_intra_bsize) {
- int i, j;
- const int width = num_4x4_blocks_wide_lookup[bsize];
- const int height = num_4x4_blocks_high_lookup[bsize];
-
- int rate2 = 0;
- int64_t dist2 = 0;
- const int dst_stride = cpi->sf.reuse_inter_pred_sby ? bw : pd->dst.stride;
- const int src_stride = p->src.stride;
- int block_idx = 0;
-
- TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
- const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size];
- const int step = 1 << tmp_tx_size;
+ PREDICTION_MODE this_mode;
+ struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+ const TX_SIZE intra_tx_size =
+ MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
if (cpi->sf.reuse_inter_pred_sby) {
pd->dst.buf = tmp[0].data;
@@ -691,44 +731,26 @@
}
for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
- uint8_t *const src_buf_base = p->src.buf;
- uint8_t *const dst_buf_base = pd->dst.buf;
- for (j = 0; j < height; j += step) {
- for (i = 0; i < width; i += step) {
- p->src.buf = &src_buf_base[4 * (j * src_stride + i)];
- pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)];
- // Use source buffer as an approximation for the fully reconstructed
- // buffer
- vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize),
- tmp_tx_size, this_mode,
- p->src.buf, src_stride,
- pd->dst.buf, dst_stride,
- i, j, 0);
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
- rate2 += rate;
- dist2 += dist;
- ++block_idx;
- }
- }
- p->src.buf = src_buf_base;
- pd->dst.buf = dst_buf_base;
-
- rate = rate2;
- dist = dist2;
-
+ const TX_SIZE saved_tx_size = mbmi->tx_size;
+ args.mode = this_mode;
+ args.rate = 0;
+ args.dist = 0;
+ mbmi->tx_size = intra_tx_size;
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+ estimate_block_intra, &args);
+ mbmi->tx_size = saved_tx_size;
+ rate = args.rate;
+ dist = args.dist;
rate += cpi->mbmode_cost[this_mode];
rate += intra_cost_penalty;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
- if (cpi->sf.reuse_inter_pred_sby)
- pd->dst = orig_dst;
-
if (this_rd + intra_mode_cost < best_rd) {
best_rd = this_rd;
*returnrate = rate;
*returndistortion = dist;
mbmi->mode = this_mode;
- mbmi->tx_size = tmp_tx_size;
+ mbmi->tx_size = intra_tx_size;
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
@@ -736,7 +758,7 @@
x->skip_txfm[0] = skip_txfm;
}
}
+ if (cpi->sf.reuse_inter_pred_sby)
+ pd->dst = orig_dst;
}
-
- return INT64_MAX;
}
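
The non-RD path hands out its three scratch prediction buffers by flipping in_use flags, and the patch makes free_pred_buffer() tolerate NULL because best_pred may never be assigned. A minimal sketch of that pool discipline; the pool size and dimensions are illustrative:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint8_t *data;
  int stride;
  int in_use;
} PRED_BUFFER;

static PRED_BUFFER *alloc_pred_buffer(PRED_BUFFER *pool, int n) {
  int i;
  for (i = 0; i < n; ++i)
    if (!pool[i].in_use) {
      pool[i].in_use = 1;
      return &pool[i];
    }
  return NULL;                    /* caller must tolerate exhaustion */
}

static void free_pred_buffer(PRED_BUFFER *p) {
  if (p != NULL)                  /* the guard added by the patch */
    p->in_use = 0;
}

int main(void) {
  static uint8_t storage[3][64 * 64];
  PRED_BUFFER pool[3], *best_pred = NULL;
  int i;
  for (i = 0; i < 3; ++i) {
    pool[i].data = storage[i];
    pool[i].stride = 64;
    pool[i].in_use = 0;
  }
  free_pred_buffer(best_pred);    /* safe even when nothing was chosen */
  best_pred = alloc_pred_buffer(pool, 3);
  assert(best_pred != NULL && best_pred->in_use);
  free_pred_buffer(best_pred);
  printf("in_use after free: %d\n", pool[0].in_use);
  return 0;
}
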
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 49c6feb..97aeca7 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -17,19 +17,13 @@
extern "C" {
#endif
-typedef struct {
- uint8_t *data;
- int stride;
- int in_use;
-} PRED_BUFFER;
-
-int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const struct TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
- BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
+ int mi_row, int mi_col,
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
#ifdef __cplusplus
} // extern "C"
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index eababdb..d49eb95 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,9 +19,9 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -40,9 +40,9 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -62,11 +62,11 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -78,13 +78,13 @@
(void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
- for (i = 0; i < count; i++) {
+ for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
@@ -105,12 +105,12 @@
// TODO(jingning) Refactor this file and combine functions with similar
// operations.
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -120,8 +120,8 @@
(void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -146,27 +146,27 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, non_zero_count = (int)count, eob = -1;
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
zbin_ptr[1] + zbin_oq_value };
const int nzbins[2] = { zbins[0] * -1,
zbins[1] * -1 };
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
- for (i = (int)count - 1; i >= 0; i--) {
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
@@ -199,12 +199,12 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -217,8 +217,8 @@
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -280,13 +280,19 @@
*shift = 1 << (16 - l);
}
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+ int quant = vp9_dc_quant(q, 0);
+ (void) bit_depth;
+ return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+}
+
void vp9_init_quantizer(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
QUANTS *const quants = &cpi->quants;
int i, q, quant;
for (q = 0; q < QINDEX_RANGE; q++) {
- const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+ const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
const int qrounding_factor = q == 0 ? 64 : 48;
for (i = 0; i < 2; ++i) {
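
The int16_t-to-tran_low_t rewrite throughout vp9_quantize.c exists so coefficients can widen to 32 bits when high bit depth is enabled, and switching the memset sizes to sizeof(*ptr) keeps the byte counts correct either way. A sketch of the idea; the typedef here is an assumption modelled on the patch, not copied from the vpx headers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#if CONFIG_VP9_HIGHBITDEPTH        /* evaluates as 0 when undefined */
typedef int32_t tran_low_t;
#else
typedef int16_t tran_low_t;
#endif

static void clear_coeffs(tran_low_t *qcoeff, tran_low_t *dqcoeff,
                         intptr_t n_coeffs) {
  /* sizeof(*qcoeff) tracks the element type, so this stays correct
   * whether tran_low_t is 16 or 32 bits wide; sizeof(int16_t) would
   * silently clear only half the buffer in the wide case. */
  memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
  memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
}

int main(void) {
  tran_low_t q[16], dq[16];
  clear_coeffs(q, dq, 16);
  printf("element size: %zu bytes\n", sizeof(tran_low_t));
  return 0;
}
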
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 262529b..d7edb0b 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,17 +37,29 @@
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif
+
struct VP9_COMP;
struct VP9Common;
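
get_qzbin_factor(), factored out in vp9_quantize.c above, takes the bit depth even though it ignores it for now, leaving a seam for the high-bit-depth variants declared here. A standalone sketch of its shape, with a toy table standing in for vp9_dc_quant():

#include <stdio.h>

typedef enum { VPX_BITS_8 = 8, VPX_BITS_10 = 10, VPX_BITS_12 = 12 }
    vpx_bit_depth_t;

static int toy_dc_quant(int q) { return 4 + q * 2; }  /* stand-in */

static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
  const int quant = toy_dc_quant(q);
  (void)bit_depth;  /* reserved for the high-bit-depth follow-up */
  return q == 0 ? 64 : (quant < 148 ? 84 : 80);
}

int main(void) {
  printf("%d %d %d\n", get_qzbin_factor(0, VPX_BITS_8),
         get_qzbin_factor(10, VPX_BITS_8),
         get_qzbin_factor(100, VPX_BITS_8));
  return 0;
}
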
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 290567e..b607c85 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -646,7 +646,6 @@
int q;
if (frame_is_intra_only(cm)) {
- active_best_quality = rc->best_quality;
// Handle the special case for key frames forced when we have reached
// the maximum key frame interval. Here force the Q to a range
@@ -1208,7 +1207,7 @@
? INT_MAX : (int)(rc->starting_buffer_level / 2);
} else {
int kf_boost = 32;
- double framerate = oxcf->framerate;
+ double framerate = cpi->framerate;
if (svc->number_temporal_layers > 1 &&
oxcf->rc_mode == VPX_CBR) {
// Use the layer framerate for temporal layers CBR mode.
@@ -1236,7 +1235,7 @@
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1;
cpi->ref_frame_flags &=
(~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
@@ -1248,7 +1247,7 @@
} else {
cm->frame_type = INTER_FRAME;
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
if (cpi->svc.spatial_layer_id == 0) {
lc->is_key_frame = 0;
@@ -1364,7 +1363,7 @@
RATE_CONTROL *const rc = &cpi->rc;
int vbr_max_bits;
- rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate);
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth *
oxcf->two_pass_vbrmin_section / 100);
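
The ratectrl hunks read the framerate from the live encoder state (cpi->framerate) instead of the initial config (oxcf->framerate), because the per-frame bit budget is the target bitrate divided by the framerate and the source rate can change mid-stream. A toy illustration with made-up numbers:

#include <stdio.h>

int main(void) {
  const long long target_bandwidth = 800000;  /* bits per second */
  const double config_framerate = 30.0;   /* recorded at init time */
  const double actual_framerate = 15.0;   /* measured by the encoder */
  /* When the source drops to 15 fps, the live value doubles the
   * per-frame budget instead of silently under-spending. */
  const int budget_cfg = (int)(target_bandwidth / config_framerate);
  const int budget_live = (int)(target_bandwidth / actual_framerate);
  printf("per-frame budget: config=%d live=%d bits\n",
         budget_cfg, budget_live);
  return 0;
}
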
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 4fc3e9e..b826ff4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -364,20 +364,16 @@
int ref_frame, BLOCK_SIZE block_size) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- int_mv this_mv;
int i;
int zero_seen = 0;
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
int max_mv = 0;
-
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
- int row_offset, col_offset;
- int num_mv_refs = MAX_MV_REF_CANDIDATES +
+ const int num_mv_refs = MAX_MV_REF_CANDIDATES +
(cpi->sf.adaptive_motion_search &&
- cpi->common.show_frame &&
block_size < cpi->sf.max_partition_size);
MV pred_mv[3];
@@ -387,19 +383,16 @@
// Get the sad for each candidate reference mv.
for (i = 0; i < num_mv_refs; ++i) {
- this_mv.as_mv = pred_mv[i];
+ const MV *this_mv = &pred_mv[i];
- max_mv = MAX(max_mv,
- MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
- // Only need to check zero mv once.
- if (!this_mv.as_int && zero_seen)
+ max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+ if (is_zero_mv(this_mv) && zero_seen)
continue;
- zero_seen = zero_seen || !this_mv.as_int;
+ zero_seen |= is_zero_mv(this_mv);
- row_offset = this_mv.as_mv.row >> 3;
- col_offset = this_mv.as_mv.col >> 3;
- ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
+ ref_y_ptr =
+ &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
// Find sad for current vector.
this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
@@ -462,7 +455,7 @@
// Set baseline threshold values.
for (i = 0; i < MAX_MODES; ++i)
- rd->thresh_mult[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+ rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
rd->thresh_mult[THR_NEARESTMV] = 0;
rd->thresh_mult[THR_NEARESTG] = 0;
@@ -548,7 +541,7 @@
int i;
for (i = 0; i < MAX_REFS; ++i)
- rd->thresh_mult_sub8x8[i] = is_best_mode(cpi->oxcf.mode) ? -500 : 0;
+ rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
rd->thresh_mult_sub8x8[THR_LAST] += 2500;
rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
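
The vp9_rd.c hunk drops the int_mv union round-trip in favour of a plain MV pointer and an is_zero_mv() predicate. A sketch of the rewritten candidate loop; the MV struct mirrors the codec's layout but the candidates are invented:

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t row, col; } MV;

static int is_zero_mv(const MV *mv) {
  return mv->row == 0 && mv->col == 0;
}

int main(void) {
  const MV candidates[3] = { { 0, 0 }, { -12, 8 }, { 0, 0 } };
  int zero_seen = 0, i;
  for (i = 0; i < 3; ++i) {
    const MV *this_mv = &candidates[i];
    if (is_zero_mv(this_mv) && zero_seen)
      continue;                      /* only cost the zero MV once */
    zero_seen |= is_zero_mv(this_mv);
    printf("checking mv (%d, %d)\n", this_mv->row, this_mv->col);
  }
  return 0;
}
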
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cc55dd7..9df85de 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -41,9 +41,14 @@
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC 1
-#define LAST_FRAME_MODE_MASK 0xFFEDCD60
-#define GOLDEN_FRAME_MODE_MASK 0xFFDA3BB0
-#define ALT_REF_MODE_MASK 0xFFC648D0
+#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+ (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+ (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+ (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
#define MIN_EARLY_TERM_INDEX 3
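
The old hex constants indexed modes in vp9_mode_order; the new masks are per reference frame, so they can be written as readable bit expressions. A compilable sketch of how one of them reads; the enum mirrors the codec's frame ordering:

#include <stdio.h>

enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };

#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                              (1 << INTRA_FRAME))

int main(void) {
  /* If the best mode so far used LAST_FRAME, every other reference
   * is masked out for the remainder of the mode loop. */
  int ref_frame;
  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
    printf("ref %d skipped: %d\n", ref_frame,
           (LAST_FRAME_MODE_MASK >> ref_frame) & 1);
  return 0;
}
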
@@ -171,30 +176,53 @@
int64_t dist_sum = 0;
const int ref = xd->mi[0]->mbmi.ref_frame[0];
unsigned int sse;
+ unsigned int var = 0;
+ unsigned int sum_sse = 0;
const int shift = 8;
+ int rate;
+ int64_t dist;
+
+ x->pred_sse[ref] = 0;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+ int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int idx, idy;
+ int lw = b_width_log2_lookup[unit_size] + 2;
+ int lh = b_height_log2_lookup[unit_size] + 2;
- const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride,
- &sse);
+ sum_sse = 0;
- if (!x->select_tx_size) {
- if (sse < p->quant_thred[0] >> shift)
- x->skip_txfm[i] = 1;
- else if (var < p->quant_thred[1] >> shift)
- x->skip_txfm[i] = 2;
- else
- x->skip_txfm[i] = 0;
+ for (idy = 0; idy < bh; ++idy) {
+ for (idx = 0; idx < bw; ++idx) {
+ uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+ uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+ int block_idx = (idy << 1) + idx;
+
+ var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
+ dst, pd->dst.stride, &sse);
+ x->bsse[(i << 2) + block_idx] = sse;
+ sum_sse += sse;
+
+ if (!x->select_tx_size) {
+ if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 1;
+ else if (var < p->quant_thred[1] >> shift)
+ x->skip_txfm[(i << 2) + block_idx] = 2;
+ else
+ x->skip_txfm[(i << 2) + block_idx] = 0;
+ }
+
+ if (i == 0)
+ x->pred_sse[ref] += sse;
+ }
}
- x->bsse[i] = sse;
- if (i == 0)
- x->pred_sse[ref] = sse;
-
// Fast approximation of the modelling function.
if (cpi->oxcf.speed > 4) {
int64_t rate;
@@ -210,9 +238,7 @@
rate_sum += rate;
dist_sum += dist;
} else {
- int rate;
- int64_t dist;
- vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
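
model_rd_for_sb() now walks each plane in max-transform-size units and keeps a skip flag and SSE per unit, indexed as (plane << 2) + block_idx, where the old code kept one per plane. A self-contained sketch of that classification; the thresholds and per-unit statistics are fabricated:

#include <stdio.h>

#define MAX_MB_PLANE 3

int main(void) {
  unsigned char skip_txfm[MAX_MB_PLANE << 2] = { 0 };
  const unsigned int quant_thred[2] = { 64, 256 };
  /* Fake SSE/variance for plane 0 split into a 2x2 grid of units. */
  const unsigned int sse[4] = { 10, 500, 90, 40 };
  const unsigned int var[4] = { 5, 400, 100, 30 };
  const int plane = 0;
  int idy, idx;
  for (idy = 0; idy < 2; ++idy) {
    for (idx = 0; idx < 2; ++idx) {
      const int block_idx = (idy << 1) + idx;
      if (sse[block_idx] < quant_thred[0])
        skip_txfm[(plane << 2) + block_idx] = 1;  /* skip transform */
      else if (var[block_idx] < quant_thred[1])
        skip_txfm[(plane << 2) + block_idx] = 2;  /* DC coeff only  */
      else
        skip_txfm[(plane << 2) + block_idx] = 0;  /* full transform */
      printf("unit %d -> %d\n", block_idx,
             skip_txfm[(plane << 2) + block_idx]);
    }
  }
  return 0;
}
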
@@ -223,7 +249,7 @@
*out_dist_sum = dist_sum << 4;
}
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz) {
int i;
int64_t error = 0, sqcoeff = 0;
@@ -262,7 +288,7 @@
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
const int eob = p->eobs[block];
- const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
x->token_costs[tx_size][type][is_inter_block(mbmi)];
uint8_t token_cache[32 * 32];
@@ -332,8 +358,8 @@
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
int shift = tx_size == TX_32X32 ? 0 : 2;
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
args->sse = this_sse >> shift;
@@ -372,17 +398,17 @@
if (!is_inter_block(mbmi)) {
vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
dist_block(plane, block, tx_size, args);
- } else {
- if (x->skip_txfm[plane] == 0) {
+ } else if (max_txsize_lookup[plane_bsize] == tx_size) {
+ if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
// full forward transform and quantization
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
dist_block(plane, block, tx_size, args);
- } else if (x->skip_txfm[plane] == 2) {
+ } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
// compute DC coefficient
- int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
if (!x->plane[plane].eobs[block])
args->dist = args->sse - ((coeff[0] * coeff[0] -
@@ -390,9 +416,13 @@
} else {
// skip forward transform
x->plane[plane].eobs[block] = 0;
- args->sse = x->bsse[plane] << 4;
+ args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
}
+ } else {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ dist_block(plane, block, tx_size, args);
}
rate_block(plane, block, plane_bsize, tx_size, args);
@@ -468,7 +498,6 @@
txfm_rd_in_plane(x, rate, distortion, skip,
sse, ref_best_rd, 0, bs,
mbmi->tx_size, cpi->sf.use_fast_coef_costing);
- cpi->tx_stepdown_count[0]++;
}
static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
@@ -490,24 +519,24 @@
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX},
{INT64_MAX, INT64_MAX}};
- TX_SIZE n, m;
+ int n, m;
int s0, s1;
const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
int64_t best_rd = INT64_MAX;
- TX_SIZE best_tx = TX_4X4;
+ TX_SIZE best_tx = max_tx_size;
const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
- for (n = TX_4X4; n <= max_tx_size; n++) {
+ for (n = max_tx_size; n >= 0; n--) {
txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n],
&sse[n], ref_best_rd, 0, bs, n,
cpi->sf.use_fast_coef_costing);
r[n][1] = r[n][0];
if (r[n][0] < INT_MAX) {
- for (m = 0; m <= n - (n == max_tx_size); m++) {
+ for (m = 0; m <= n - (n == (int) max_tx_size); m++) {
if (m == n)
r[n][1] += vp9_cost_zero(tx_probs[m]);
else
@@ -523,6 +552,13 @@
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
+ // Early termination in transform size search.
+ if (cpi->sf.tx_size_search_breakout &&
+ (rd[n][1] == INT64_MAX ||
+ (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) ||
+ s[n] == 1))
+ break;
+
if (rd[n][1] < best_rd) {
best_tx = n;
best_rd = rd[n][1];
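
choose_tx_size_from_rd() now starts at the largest transform size and walks down, breaking out as soon as the cost trend reverses; that is what makes tx_size_search_breakout pay off. A toy version of the loop with fabricated RD costs:

#include <limits.h>
#include <stdio.h>

int main(void) {
  /* rd[n] = RD cost for transform size index n (0 = 4x4 .. 3 = 32x32). */
  const long long rd[4] = { 900, 700, 650, 800 };
  const int max_tx_size = 3;
  int n, best_tx = max_tx_size;
  long long best_rd = LLONG_MAX;
  for (n = max_tx_size; n >= 0; n--) {
    /* Early termination: once a size is worse than the size above
     * it, still smaller sizes are unlikely to win. */
    if (n < max_tx_size && rd[n] > rd[n + 1])
      break;
    if (rd[n] < best_rd) {
      best_rd = rd[n];
      best_tx = n;
    }
  }
  printf("best tx size index %d, rd %lld\n", best_tx, best_rd);
  return 0;
}
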
@@ -544,60 +580,36 @@
if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
- cpi->tx_stepdown_count[0]++;
} else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
- cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
- cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
- cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
-static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int64_t *distortion, int *skip,
- int64_t *psse, BLOCK_SIZE bs,
- int64_t txfm_cache[TX_MODES],
- int64_t ref_best_rd) {
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip,
+ int64_t *psse, BLOCK_SIZE bs,
+ int64_t txfm_cache[TX_MODES],
+ int64_t ref_best_rd) {
MACROBLOCKD *xd = &x->e_mbd;
+ int64_t sse;
+ int64_t *ret_sse = psse ? psse : &sse;
assert(bs == xd->mi[0]->mbmi.sb_type);
- vp9_subtract_plane(x, bs, 0);
-
if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd,
+ choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
bs);
} else {
- choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse,
+ choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
txfm_cache, ref_best_rd, bs);
}
}
-static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
- int64_t *distortion, int *skip,
- BLOCK_SIZE bs,
- int64_t txfm_cache[TX_MODES],
- int64_t ref_best_rd) {
- MACROBLOCKD *xd = &x->e_mbd;
- int64_t sse;
-
- assert(bs == xd->mi[0]->mbmi.sb_type);
- if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) {
- vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
- choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd,
- bs);
- } else {
- choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse,
- txfm_cache, ref_best_rd, bs);
- }
-}
-
-
static int conditional_skipintra(PREDICTION_MODE mode,
PREDICTION_MODE best_intra_mode) {
if (mode == D117_PRED &&
@@ -678,7 +690,7 @@
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
p->src_diff);
- int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
@@ -847,8 +859,8 @@
}
mic->mbmi.mode = mode;
- intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
- &s, bsize, local_tx_cache, best_rd);
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+ &s, NULL, bsize, local_tx_cache, best_rd);
if (this_rate_tokenonly == INT_MAX)
continue;
@@ -1125,7 +1137,7 @@
for (idy = 0; idy < height / 4; ++idy) {
for (idx = 0; idx < width / 4; ++idx) {
int64_t ssz, rd, rd1, rd2;
- int16_t* coeff;
+ tran_low_t* coeff;
k += (idy * 2 + idx);
coeff = BLOCK_OFFSET(p->coeff, k);
@@ -1217,11 +1229,9 @@
// TODO(aconverse): Find out if this is still productive then clean up or remove
static int check_best_zero_mv(
const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
- int inter_mode_mask, int this_mode,
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
const MV_REFERENCE_FRAME ref_frames[2]) {
- if ((inter_mode_mask & (1 << ZEROMV)) &&
- (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+ if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
(ref_frames[1] == NONE ||
frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
@@ -1339,7 +1349,6 @@
continue;
if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
- inter_mode_mask,
this_mode, mbmi->ref_frame))
continue;
@@ -1358,13 +1367,14 @@
int sadpb = x->sadperbit4;
MV mvp_full;
int max_mv;
+ int sad_list[5];
/* Is the best so far sufficiently good that we can't justify doing
* a new motion search. */
if (best_rd < label_mv_thresh)
break;
- if (!is_best_mode(cpi->oxcf.mode)) {
+ if (cpi->oxcf.mode != BEST) {
// use previous block's result as next block's MV predictor.
if (i > 0) {
bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
@@ -1390,7 +1400,7 @@
mvp_full.row = bsi->mvp.as_mv.row >> 3;
mvp_full.col = bsi->mvp.as_mv.col >> 3;
- if (cpi->sf.adaptive_motion_search && cm->show_frame) {
+ if (cpi->sf.adaptive_motion_search) {
mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3;
step_param = MAX(step_param, 8);
@@ -1401,12 +1411,14 @@
vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
- bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
- sadpb, &bsi->ref_mv[0]->as_mv, new_mv,
- INT_MAX, 1);
+ bestsme = vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL,
+ &bsi->ref_mv[0]->as_mv, new_mv,
+ INT_MAX, 1);
// Should we do a full search (best quality only)
- if (is_best_mode(cpi->oxcf.mode)) {
+ if (cpi->oxcf.mode == BEST) {
int_mv *const best_mv = &mi->bmi[i].as_mv[0];
/* Check if mvp_full is within the range. */
clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
@@ -1415,6 +1427,7 @@
sadpb, 16, &cpi->fn_ptr[bsize],
&bsi->ref_mv[0]->as_mv,
&best_mv->as_mv);
+ sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
if (thissme < bestsme) {
bestsme = thissme;
*new_mv = best_mv->as_mv;
@@ -1427,17 +1440,19 @@
if (bestsme < INT_MAX) {
int distortion;
- cpi->find_fractional_mv_step(x,
- new_mv,
- &bsi->ref_mv[0]->as_mv,
- cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize],
- cpi->sf.mv.subpel_force_stop,
- cpi->sf.mv.subpel_iters_per_step,
- x->nmvjointcost, x->mvcost,
- &distortion,
- &x->pred_sse[mbmi->ref_frame[0]],
- NULL, 0, 0);
+ cpi->find_fractional_mv_step(
+ x,
+ new_mv,
+ &bsi->ref_mv[0]->as_mv,
+ cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
+ x->nmvjointcost, x->mvcost,
+ &distortion,
+ &x->pred_sse[mbmi->ref_frame[0]],
+ NULL, 0, 0);
// save motion search result for use in compound prediction
seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
@@ -1694,12 +1709,14 @@
int mode_index,
int64_t comp_pred_diff[REFERENCE_MODES],
const int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
+ int skippable) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
ctx->skip = x->skip;
+ ctx->skippable = skippable;
ctx->best_mode_index = mode_index;
ctx->mic = *xd->mi[0];
ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
@@ -1765,6 +1782,7 @@
int tmp_col_max = x->mv_col_max;
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
+ int sad_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
@@ -1799,8 +1817,7 @@
step_param = cpi->mv_step_param;
}
- if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
- cm->show_frame) {
+ if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
b_width_log2(bsize)));
step_param = MAX(step_param, boffset);
@@ -1837,6 +1854,7 @@
mvp_full.row >>= 3;
bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+ cond_sad_list(cpi, sad_list),
&ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
x->mv_col_min = tmp_col_min;
@@ -1852,13 +1870,14 @@
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
}
*rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- if (cpi->sf.adaptive_motion_search && cm->show_frame)
+ if (cpi->sf.adaptive_motion_search)
x->pred_mv[ref] = tmp_mv->as_mv;
if (scaled_ref_frame) {
@@ -1976,6 +1995,7 @@
x->errorperbit,
&cpi->fn_ptr[bsize],
0, cpi->sf.mv.subpel_iters_per_step,
+ NULL,
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
pw, ph);
@@ -2111,6 +2131,8 @@
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
+ INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+ int (*single_skippable)[MAX_REF_FRAMES],
int64_t *psse,
const int64_t ref_best_rd) {
VP9_COMMON *cm = &cpi->common;
@@ -2128,14 +2150,14 @@
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
int pred_exists = 0;
int intpel_mv;
- int64_t rd, best_rd = INT64_MAX;
+ int64_t rd, tmp_rd, best_rd = INT64_MAX;
int best_needs_copy = 0;
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
INTERP_FILTER best_filter = SWITCHABLE;
- int skip_txfm[MAX_MB_PLANE] = {0};
- int64_t bsse[MAX_MB_PLANE] = {0};
+ uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
+ int64_t bsse[MAX_MB_PLANE << 2] = {0};
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
@@ -2157,6 +2179,12 @@
if (frame_mv[refs[0]].as_int == INVALID_MV ||
frame_mv[refs[1]].as_int == INVALID_MV)
return INT64_MAX;
+
+ if (cpi->sf.adaptive_mode_search) {
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ }
}
if (this_mode == NEWMV) {
@@ -2218,6 +2246,10 @@
* if the first is known */
*rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV)
+ return INT64_MAX;
+
pred_exists = 0;
// Are all MVs integer pel for Y and UV
intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
@@ -2256,6 +2288,13 @@
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
+ if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << i))) {
+ rate_sum = INT_MAX;
+ dist_sum = INT64_MAX;
+ continue;
+ }
+
if ((cm->interp_filter == SWITCHABLE &&
(!i || best_needs_copy)) ||
(cm->interp_filter != SWITCHABLE &&
@@ -2306,6 +2345,7 @@
(cm->interp_filter != SWITCHABLE &&
cm->interp_filter == mbmi->interp_filter)) {
pred_exists = 1;
+ tmp_rd = best_rd;
}
}
restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2324,17 +2364,30 @@
xd->plane[i].dst.stride = 64;
}
}
+ rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
} else {
- // Handles the special case when a filter that is not in the
- // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- }
-
- if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
int tmp_rate;
int64_t tmp_dist;
+ // Handles the special case when a filter that is not in the
+ // switchable list (e.g. bilinear) is indicated at the frame level, or
+ // skip condition holds.
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+ }
+
+ if (!is_comp_pred)
+ single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+
+ if (cpi->sf.adaptive_mode_search)
+ if (is_comp_pred)
+ if (single_skippable[this_mode][refs[0]] &&
+ single_skippable[this_mode][refs[1]])
+ vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
// if current pred_error modeled rd is substantially more than the best
// so far, do not bother doing full rd
if (rd / 2 > ref_best_rd) {
@@ -2344,7 +2397,7 @@
}
if (cm->interp_filter == SWITCHABLE)
- *rate2 += vp9_get_switchable_rate(cpi);
+ *rate2 += rs;
if (!is_comp_pred) {
if (cpi->allow_encode_breakout)
@@ -2361,8 +2414,9 @@
int64_t rdcosty = INT64_MAX;
// Y cost and distortion
- inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
- bsize, txfm_cache, ref_best_rd);
+ vp9_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
+ bsize, txfm_cache, ref_best_rd);
if (*rate_y == INT_MAX) {
*rate2 = INT_MAX;
@@ -2392,6 +2446,9 @@
*skippable = skippable_y && skippable_uv;
}
+ if (!is_comp_pred)
+ single_skippable[this_mode][refs[0]] = *skippable;
+
restore_dst_buf(xd, orig_dst, orig_dst_stride);
return this_rd; // if 0, this will be re-calculated by caller
}
@@ -2498,10 +2555,12 @@
PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
- int comp_pred, i;
+ int comp_pred, i, k;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+ INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int64_t best_rd = best_rd_so_far;
@@ -2512,30 +2571,27 @@
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode;
+ int best_mode_skippable = 0;
int mode_index, best_mode_index = -1;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
int64_t best_intra_rd = INT64_MAX;
int64_t best_inter_rd = INT64_MAX;
+ unsigned int best_pred_sse = UINT_MAX;
PREDICTION_MODE best_intra_mode = DC_PRED;
MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
PREDICTION_MODE mode_uv[TX_SIZES];
- int64_t mode_distortions[MB_MODE_COUNT] = {-1};
int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
- const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
- const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
- int mode_skip_mask = 0;
+ uint8_t ref_frame_skip_mask[2] = { 0 };
+ uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
int mode_skip_start = cpi->sf.mode_skip_start + 1;
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
- const int intra_y_mode_mask =
- cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
- int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
vp9_zero(best_mbmode);
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
@@ -2552,6 +2608,12 @@
rate_uv_intra[i] = INT_MAX;
for (i = 0; i < MAX_REF_FRAMES; ++i)
x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < MB_MODE_COUNT; ++i) {
+ for (k = 0; k < MAX_REF_FRAMES; ++k) {
+ single_inter_filter[i][k] = SWITCHABLE;
+ single_skippable[i][k] = 0;
+ }
+ }
*returnrate = INT_MAX;
@@ -2566,23 +2628,17 @@
}
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- // All modes from vp9_mode_order that use this frame as any ref
- static const int ref_frame_mask_all[] = {
- 0x0, 0x123291, 0x25c444, 0x39b722
- };
- // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
- // this frame as their primary ref
- static const int ref_frame_mask_fixedmv[] = {
- 0x0, 0x121281, 0x24c404, 0x080102
- };
if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
- // Skip modes for missing references
- mode_skip_mask |= ref_frame_mask_all[ref_frame];
+ // Skip checking missing references in both single and compound reference
+ // modes. Note that a mode will be skipped iff both reference frames
+ // are masked out.
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
} else if (cpi->sf.reference_masking) {
for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
// Skip fixed mv modes for poor references
if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
- mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
break;
}
}
@@ -2591,7 +2647,8 @@
// then do nothing if the current ref frame is not allowed.
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
- mode_skip_mask |= ref_frame_mask_all[ref_frame];
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
}
}
@@ -2604,35 +2661,32 @@
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- mode_skip_mask =
- ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
- mode_skip_mask |= (1 << THR_NEARA);
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
- mode_skip_mask |= (1 << THR_NEARESTA);
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
}
}
- // TODO(JBB): This is to make up for the fact that we don't have sad
- // functions that work when the block size reads outside the umv. We
- // should fix this either by making the motion search just work on
- // a representative block in the boundary ( first ) and then implement a
- // function that does sads when inside the border..
- if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
- const int new_modes_mask =
- (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
- (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
- mode_skip_mask |= new_modes_mask;
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (cpi->sf.alt_ref_search_fp) {
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
}
if (bsize > cpi->sf.max_intra_bsize) {
- const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) |
- (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) |
- (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) |
- (1 << THR_D117_PRED) | (1 << THR_D45_PRED);
- mode_skip_mask |= all_intra_modes;
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
}
+ mode_skip_mask[INTRA_FRAME] |=
+ ~(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
int mode_excluded = 0;
int64_t this_rd = INT64_MAX;
@@ -2647,20 +2701,26 @@
int64_t total_sse = INT64_MAX;
int early_term = 0;
+ this_mode = vp9_mode_order[mode_index].mode;
+ ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+ second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
// Look at the reference frame of the best mode so far and set the
// skip mask to look at a subset of the remaining modes.
if (mode_index == mode_skip_start && best_mode_index >= 0) {
- switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
+ switch (best_mbmode.ref_frame[0]) {
case INTRA_FRAME:
break;
case LAST_FRAME:
- mode_skip_mask |= LAST_FRAME_MODE_MASK;
+ ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
break;
case GOLDEN_FRAME:
- mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+ ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
break;
case ALTREF_FRAME:
- mode_skip_mask |= ALT_REF_MODE_MASK;
+ ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
break;
case NONE:
case MAX_REF_FRAMES:
@@ -2668,7 +2728,12 @@
break;
}
}
- if (mode_skip_mask & (1 << mode_index))
+
+ if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
+ ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+ continue;
+
+ if (mode_skip_mask[ref_frame] & (1 << this_mode))
continue;
// Test best rd so far against threshold for trying this mode.
@@ -2676,17 +2741,63 @@
rd_thresh_freq_fact[mode_index]))
continue;
- this_mode = vp9_mode_order[mode_index].mode;
- ref_frame = vp9_mode_order[mode_index].ref_frame[0];
- if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
- continue;
- second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+ if (cpi->sf.motion_field_mode_search) {
+ const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize],
+ tile->mi_col_end - mi_col);
+ const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
+ tile->mi_row_end - mi_row);
+ const int bsl = mi_width_log2(bsize);
+ int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+ + get_chessboard_index(cm->current_video_frame)) & 0x1;
+ MB_MODE_INFO *ref_mbmi;
+ int const_motion = 1;
+ int skip_ref_frame = !cb_partition_search_ctrl;
+ MV_REFERENCE_FRAME rf = NONE;
+ int_mv ref_mv;
+ ref_mv.as_int = INVALID_MV;
+
+ if ((mi_row - 1) >= tile->mi_row_start) {
+ ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
+ rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
+ for (i = 0; i < mi_width; ++i) {
+ ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
+ const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
+ (ref_frame == ref_mbmi->ref_frame[0]);
+ skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+ }
+ }
+
+ if ((mi_col - 1) >= tile->mi_col_start) {
+ if (ref_mv.as_int == INVALID_MV)
+ ref_mv = xd->mi[-1]->mbmi.mv[0];
+ if (rf == NONE)
+ rf = xd->mi[-1]->mbmi.ref_frame[0];
+ for (i = 0; i < mi_height; ++i) {
+ ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
+ const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
+ (ref_frame == ref_mbmi->ref_frame[0]);
+ skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
+ }
+ }
+
+ if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
+ if (rf > INTRA_FRAME)
+ if (ref_frame != rf)
+ continue;
+
+ if (const_motion)
+ if (this_mode == NEARMV || this_mode == ZEROMV)
+ continue;
+ }
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
+ if (!cm->allow_comp_inter_inter)
+ continue;
+
if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
best_mode_index >=0 &&
- vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
+ best_mbmode.ref_frame[0] == INTRA_FRAME)
continue;
if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
ref_frame != best_inter_ref_frame &&
@@ -2699,8 +2810,10 @@
}
if (ref_frame == INTRA_FRAME) {
- if (!(intra_y_mode_mask & (1 << this_mode)))
- continue;
+ if (cpi->sf.adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+ continue;
+
if (this_mode != DC_PRED) {
// Disable intra modes other than DC_PRED for blocks with low variance
// Threshold for intra skipping based on source variance
@@ -2714,7 +2827,7 @@
if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
(this_mode >= D45_PRED && this_mode <= TM_PRED)) {
if (best_mode_index >= 0 &&
- vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+ best_mbmode.ref_frame[0] > INTRA_FRAME)
continue;
}
if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
@@ -2725,7 +2838,7 @@
} else {
const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
- inter_mode_mask, this_mode, ref_frames))
+ this_mode, ref_frames))
continue;
}
@@ -2737,6 +2850,8 @@
// them for this frame.
mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
: cm->interp_filter;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -2752,8 +2867,8 @@
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
- intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
- bsize, tx_cache, best_rd);
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+ NULL, bsize, tx_cache, best_rd);
if (rate_y == INT_MAX)
continue;
@@ -2783,7 +2898,8 @@
&rate_uv, &distortion_uv,
&disable_skip, frame_mv,
mi_row, mi_col,
- single_newmv, &total_sse, best_rd);
+ single_newmv, single_inter_filter,
+ single_skippable, &total_sse, best_rd);
if (this_rd == INT64_MAX)
continue;
@@ -2860,12 +2976,6 @@
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
- // Store the respective mode distortions for later use.
- if (mode_distortions[this_mode] == -1
- || distortion2 < mode_distortions[this_mode]) {
- mode_distortions[this_mode] = distortion2;
- }
-
// Did this mode help, i.e. is it the new best mode?
if (this_rd < best_rd || x->skip) {
int max_plane = MAX_MB_PLANE;
@@ -2877,6 +2987,8 @@
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
max_plane = 1;
+ } else {
+ best_pred_sse = x->pred_sse[ref_frame];
}
*returnrate = rate2;
@@ -2884,6 +2996,8 @@
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ best_mode_skippable = skippable;
+
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
@@ -2983,13 +3097,35 @@
break;
}
+  // The inter modes' rate costs are not calculated precisely in some cases.
+  // As a result, NEWMV is sometimes chosen instead of NEARESTMV, NEARMV, or
+  // ZEROMV. Check for those cases here and correct the mode decision.
+ if (best_mbmode.mode == NEWMV) {
+ const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1]};
+ int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) || !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) || !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == 0 &&
+ ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+ }
+
if (best_mode_index < 0 || best_rd >= best_rd_so_far)
return INT64_MAX;
// If we used an estimate for the uv intra rd in the loop above...
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
- if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
+ if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
TX_SIZE uv_tx_size;
*mbmi = best_mbmode;
uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
@@ -3039,9 +3175,8 @@
vp9_zero(best_tx_diff);
}
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- store_coding_context(x, ctx, best_mode_index,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+ best_tx_diff, best_filter_diff, best_mode_skippable);
return best_rd;
}
@@ -3146,7 +3281,7 @@
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
store_coding_context(x, ctx, THR_ZEROMV,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ best_pred_diff, best_tx_diff, best_filter_diff, 0);
return this_rd;
}
@@ -3283,6 +3418,9 @@
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
+ if (!cm->allow_comp_inter_inter)
+ continue;
+
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
// Do not allow compound prediction if the segment level reference frame
@@ -3316,14 +3454,12 @@
// If the segment reference frame feature is enabled, then do nothing if
// the current ref frame is not allowed.
if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
- vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
- (int)ref_frame) {
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
// prevent the possibility that we end up unable to pick any mode.
- } else if (!vp9_segfeature_active(seg, segment_id,
- SEG_LVL_REF_FRAME)) {
+ } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
// Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
// unless ARNR filtering is enabled in which case we want
// an unfiltered alternative. We allow near/nearest as well
@@ -3749,9 +3885,8 @@
vp9_zero(best_filter_diff);
}
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
store_coding_context(x, ctx, best_ref_index,
- best_pred_diff, best_tx_diff, best_filter_diff);
+ best_pred_diff, best_tx_diff, best_filter_diff, 0);
return best_rd;
}
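
The NEWMV demotion added above can be read in isolation. The following is a
minimal sketch of the idea with simplified stand-in types and a plain int in
place of the motion-vector struct; the real code also checks the second
reference of a compound prediction:

typedef enum { NEARESTMV, NEARMV, ZEROMV, NEWMV } PRED_MODE;

/* If the motion vector picked for NEWMV coincides with the NEARESTMV or
 * NEARMV candidate (or is zero), relabel the mode so the same vector is
 * signalled at a lower rate cost. Single-prediction case only. */
static PRED_MODE demote_newmv(PRED_MODE mode, int mv,
                              int nearest_mv, int near_mv) {
  if (mode != NEWMV)
    return mode;
  if (mv == nearest_mv)
    return NEARESTMV;
  if (mv == near_mv)
    return NEARMV;
  if (mv == 0)
    return ZEROMV;
  return NEWMV;
}
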
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index d062636..cee6ce1 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -14,6 +14,9 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
#include "vp9/encoder/vp9_variance.h"
static INLINE unsigned int sad(const uint8_t *a, int a_stride,
@@ -131,3 +134,138 @@
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride,
+ const uint16_t *b, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define high_sadMxN(m, n) \
+unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return high_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return high_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
+#define high_sadMxNxK(m, n, k) \
+void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sads) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+}
+
+#define high_sadMxNx4D(m, n) \
+void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const refs[], \
+ int ref_stride, unsigned int *sads) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+}
+
+// 64x64
+high_sadMxN(64, 64)
+high_sadMxNxK(64, 64, 3)
+high_sadMxNxK(64, 64, 8)
+high_sadMxNx4D(64, 64)
+
+// 64x32
+high_sadMxN(64, 32)
+high_sadMxNx4D(64, 32)
+
+// 32x64
+high_sadMxN(32, 64)
+high_sadMxNx4D(32, 64)
+
+// 32x32
+high_sadMxN(32, 32)
+high_sadMxNxK(32, 32, 3)
+high_sadMxNxK(32, 32, 8)
+high_sadMxNx4D(32, 32)
+
+// 32x16
+high_sadMxN(32, 16)
+high_sadMxNx4D(32, 16)
+
+// 16x32
+high_sadMxN(16, 32)
+high_sadMxNx4D(16, 32)
+
+// 16x16
+high_sadMxN(16, 16)
+high_sadMxNxK(16, 16, 3)
+high_sadMxNxK(16, 16, 8)
+high_sadMxNx4D(16, 16)
+
+// 16x8
+high_sadMxN(16, 8)
+high_sadMxNxK(16, 8, 3)
+high_sadMxNxK(16, 8, 8)
+high_sadMxNx4D(16, 8)
+
+// 8x16
+high_sadMxN(8, 16)
+high_sadMxNxK(8, 16, 3)
+high_sadMxNxK(8, 16, 8)
+high_sadMxNx4D(8, 16)
+
+// 8x8
+high_sadMxN(8, 8)
+high_sadMxNxK(8, 8, 3)
+high_sadMxNxK(8, 8, 8)
+high_sadMxNx4D(8, 8)
+
+// 8x4
+high_sadMxN(8, 4)
+high_sadMxNxK(8, 4, 8)
+high_sadMxNx4D(8, 4)
+
+// 4x8
+high_sadMxN(4, 8)
+high_sadMxNxK(4, 8, 8)
+high_sadMxNx4D(4, 8)
+
+// 4x4
+high_sadMxN(4, 4)
+high_sadMxNxK(4, 4, 3)
+high_sadMxNxK(4, 4, 8)
+high_sadMxNx4D(4, 4)
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
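
The high-bitdepth SAD functions above differ from the 8-bit path only in how
the pointers are interpreted: CONVERT_TO_SHORTPTR recovers a uint16_t view of
the frame buffer. A self-contained sketch of the core loop, without the
libvpx pointer macros:

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over 10/12-bit samples stored in uint16_t.
 * Strides are in samples, not bytes. */
static unsigned int sad_16bit(const uint16_t *a, int a_stride,
                              const uint16_t *b, int b_stride,
                              int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}
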
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index a426453..75d86ce 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -13,54 +13,26 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_speed_features.h"
-enum {
- INTRA_ALL = (1 << DC_PRED) |
- (1 << V_PRED) | (1 << H_PRED) |
- (1 << D45_PRED) | (1 << D135_PRED) |
- (1 << D117_PRED) | (1 << D153_PRED) |
- (1 << D207_PRED) | (1 << D63_PRED) |
- (1 << TM_PRED),
- INTRA_DC = (1 << DC_PRED),
- INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
- INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
- INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
- (1 << H_PRED)
-};
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality.
+static int frame_is_boosted(const VP9_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
+ vp9_is_upper_layer_key_frame(cpi);
+}
-enum {
- INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
- INTER_NEAREST = (1 << NEARESTMV),
- INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
-};
-
-enum {
- DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) |
- (1 << THR_COMP_LA) |
- (1 << THR_ALTR) |
- (1 << THR_GOLD) |
- (1 << THR_LAST),
-
- DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
-
- DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
-
- LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
- (1 << THR_COMP_LA) |
- (1 << THR_ALTR) |
- (1 << THR_GOLD)
-};
static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
SPEED_FEATURES *sf, int speed) {
+ const int boosted = frame_is_boosted(cpi);
+
sf->adaptive_rd_thresh = 1;
- sf->recode_loop = (speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW;
sf->allow_skip_recode = 1;
if (speed >= 1) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
- sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
- : USE_LARGESTALL;
if (MIN(cm->width, cm->height) >= 720)
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
@@ -80,19 +52,26 @@
sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+
+ sf->tx_size_search_breakout = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ sf->partition_search_breakout_rate_thr = 500;
}
if (speed >= 2) {
+ sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
+ : USE_LARGESTALL;
+
if (MIN(cm->width, cm->height) >= 720) {
- sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
- sf->last_partitioning_redo_frequency = 3;
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
sf->adaptive_pred_interp_filter = 0;
} else {
sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
- sf->last_partitioning_redo_frequency = 2;
- sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
}
sf->reference_masking = 1;
@@ -102,9 +81,13 @@
FLAG_SKIP_INTRA_LOWVAR;
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
- sf->adjust_partitioning_from_last_frame = 1;
+ sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 700;
}
if (speed >= 3) {
@@ -112,21 +95,28 @@
: USE_LARGESTALL;
if (MIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
} else {
+ sf->max_intra_bsize = BLOCK_32X32;
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
}
-
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
sf->cb_pred_filter_search = 1;
-
- sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
- sf->last_partitioning_redo_frequency = 3;
+ sf->alt_ref_search_fp = 1;
+ sf->motion_field_mode_search = !boosted;
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
- sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
- sf->use_fast_coef_costing = 1;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+ sf->adaptive_interp_filter_search = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 1000;
}
if (speed >= 4) {
@@ -137,8 +127,15 @@
sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
sf->use_lp32x32fdct = 1;
+ sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+ sf->use_fast_coef_costing = 1;
+
+ if (MIN(cm->width, cm->height) >= 720)
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ else
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 1500;
}
if (speed >= 5) {
@@ -162,8 +159,8 @@
static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
int speed, vp9e_tune_content content) {
VP9_COMMON *const cm = &cpi->common;
- const int frames_since_key =
- cm->frame_type == KEY_FRAME ? 0 : cpi->rc.frames_since_key;
+ const int is_keyframe = cm->frame_type == KEY_FRAME;
+ const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
sf->static_segmentation = 0;
sf->adaptive_rd_thresh = 1;
sf->use_fast_coef_costing = 1;
@@ -181,6 +178,7 @@
sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
sf->use_rd_breakout = 1;
+
sf->adaptive_motion_search = 1;
sf->adaptive_pred_interp_filter = 1;
sf->mv.auto_mv_step_size = 1;
@@ -258,17 +256,16 @@
}
if (speed >= 5) {
- sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
- sf->auto_min_max_partition_size = (cm->frame_type == KEY_FRAME) ?
- RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+ sf->use_quant_fp = !is_keyframe;
+ sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
+ : STRICT_NEIGHBORING_MIN_MAX;
sf->max_partition_size = BLOCK_32X32;
sf->min_partition_size = BLOCK_8X8;
sf->partition_check =
(frames_since_key % sf->last_partitioning_redo_frequency == 1);
- sf->force_frame_boost = cm->frame_type == KEY_FRAME ||
- (frames_since_key %
- (sf->last_partitioning_redo_frequency << 1) == 1);
- sf->max_delta_qindex = (cm->frame_type == KEY_FRAME) ? 20 : 15;
+ sf->force_frame_boost = is_keyframe ||
+ (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+ sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
@@ -286,8 +283,7 @@
sf->partition_search_type = SOURCE_VAR_BASED_PARTITION;
sf->search_type_check_frequency = 50;
- sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ?
- USE_LARGESTALL : USE_TX_8X8;
+ sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
// This feature is only enabled when partition search is disabled.
sf->reuse_inter_pred_sby = 1;
@@ -297,6 +293,7 @@
sf->mv.reduce_first_step_size = 1;
}
+
if (speed >= 7) {
sf->mv.search_method = FAST_DIAMOND;
sf->mv.fullpel_search_step_param = 10;
@@ -305,10 +302,12 @@
800 : 300;
sf->elevate_newmv_thresh = 2500;
}
+
if (speed >= 12) {
sf->elevate_newmv_thresh = 4000;
sf->mv.subpel_force_stop = 2;
}
+
if (speed >= 13) {
int i;
sf->max_intra_bsize = BLOCK_32X32;
@@ -341,8 +340,11 @@
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
+ sf->motion_field_mode_search = 0;
+ sf->alt_ref_search_fp = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
@@ -358,8 +360,9 @@
sf->mode_search_skip_flags = 0;
sf->force_frame_boost = 0;
sf->max_delta_qindex = 0;
- sf->disable_split_var_thresh = 0;
sf->disable_filter_search_var_thresh = 0;
+ sf->adaptive_interp_filter_search = 0;
+
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
sf->intra_uv_mode_mask[i] = INTRA_ALL;
@@ -386,21 +389,20 @@
// Recode loop tolerance %.
sf->recode_tolerance = 25;
sf->default_interp_filter = SWITCHABLE;
+ sf->tx_size_search_breakout = 0;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
- switch (oxcf->mode) {
- case ONE_PASS_BEST:
- case TWO_PASS_SECOND_BEST: // This is the best quality mode.
- cpi->diamond_search_sad = vp9_full_range_search;
- break;
- case TWO_PASS_FIRST:
- case ONE_PASS_GOOD:
- case TWO_PASS_SECOND_GOOD:
- set_good_speed_feature(cpi, cm, sf, oxcf->speed);
- break;
- case REALTIME:
- set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
- break;
- }
+ if (oxcf->mode == REALTIME)
+ set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
+ else if (oxcf->mode == GOOD)
+ set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+
+ cpi->full_search_sad = vp9_full_search_sad;
+ cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
+ : vp9_diamond_search_sad;
+ cpi->refining_search_sad = vp9_refining_search_sad;
+
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -415,6 +417,8 @@
if (sf->mv.subpel_search_method == SUBPEL_TREE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
}
cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
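
The cpi->find_fractional_mv_step binding above follows the usual libvpx
pattern of resolving a function pointer once when speed features change, so
the motion-search hot path stays branch-free. A hedged sketch of the pattern;
all names below are illustrative, not the encoder's real ones:

typedef int (*subpel_search_fn)(int start, int range);

static int subpel_tree(int start, int range)        { return start + range; }
static int subpel_tree_pruned(int start, int range) { return start + range / 2; }

/* Bound once per speed-feature update; called many times per block. */
static subpel_search_fn bind_subpel_search(int method) {
  return (method == 0) ? subpel_tree : subpel_tree_pruned;
}
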
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index de731ce..0f49154 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -17,6 +17,44 @@
extern "C" {
#endif
+enum {
+ INTRA_ALL = (1 << DC_PRED) |
+ (1 << V_PRED) | (1 << H_PRED) |
+ (1 << D45_PRED) | (1 << D135_PRED) |
+ (1 << D117_PRED) | (1 << D153_PRED) |
+ (1 << D207_PRED) | (1 << D63_PRED) |
+ (1 << TM_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+ (1 << H_PRED)
+};
+
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) |
+ (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) |
+ (1 << THR_GOLD) |
+ (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+ (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) |
+ (1 << THR_GOLD)
+};
+
typedef enum {
DIAMOND = 0,
NSTEP = 1,
@@ -40,6 +78,7 @@
typedef enum {
SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1,
// Other methods to come
} SUBPEL_SEARCH_METHODS;
@@ -63,7 +102,8 @@
typedef enum {
NOT_IN_USE = 0,
RELAXED_NEIGHBORING_MIN_MAX = 1,
- STRICT_NEIGHBORING_MIN_MAX = 2
+ CONSTRAIN_NEIGHBORING_MIN_MAX = 2,
+ STRICT_NEIGHBORING_MIN_MAX = 3
} AUTO_MIN_MAX_MODE;
typedef enum {
@@ -102,6 +142,12 @@
} MODE_SEARCH_SKIP_LOGIC;
typedef enum {
+ FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
// Search partitions using RD/NONRD criterion
SEARCH_PARTITION = 0,
@@ -283,11 +329,18 @@
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
int adaptive_pred_interp_filter;
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
int cb_partition_search;
+ int motion_field_mode_search;
+
+ int alt_ref_search_fp;
+
// Fast quantization process path
int use_quant_fp;
@@ -307,9 +360,6 @@
// defined in the MODE_SEARCH_SKIP_LOGIC enum
unsigned int mode_search_skip_flags;
- // A source variance threshold below which the split mode is disabled
- unsigned int disable_split_var_thresh;
-
// A source variance threshold below which filter search is disabled
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
@@ -374,6 +424,20 @@
// default interp filter choice
INTERP_FILTER default_interp_filter;
+
+  // Early termination in transform size search, which only applies when
+  // tx_size_search_method is USE_FULL_RD.
+ int tx_size_search_breakout;
+
+  // Adaptive interp_filter search to allow skipping of certain filter types.
+ int adaptive_interp_filter_search;
+
+  // Mask to skip evaluation of certain interp_filter types.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
} SPEED_FEATURES;
struct VP9_COMP;
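
The INTRA_*/INTER_* enums moved into this header are bitmasks indexed by
prediction mode: the mode loop tests each candidate against the active mask
before doing any RD work. A one-line sketch of the intended test, where
mode_allowed is a hypothetical helper, not part of the codebase:

/* Returns nonzero when `mode` (a small enum value) is enabled in `mask`. */
static int mode_allowed(unsigned int mask, int mode) {
  return (mask >> mode) & 1;
}
/* e.g. mode_allowed(INTRA_DC_H_V, D45_PRED) == 0 at the speed levels that
 * set intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V above. */
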
diff --git a/vp9/encoder/vp9_ssim.c b/vp9/encoder/vp9_ssim.c
index 026e6a8..8435640 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -95,7 +95,7 @@
return ssim_total;
}
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- int lumamask, double *weight) {
+ double *weight) {
double a, b, c;
double ssimv;
diff --git a/vp9/encoder/vp9_ssim.h b/vp9/encoder/vp9_ssim.h
index a581c2c..d1dd1b7 100644
--- a/vp9/encoder/vp9_ssim.h
+++ b/vp9/encoder/vp9_ssim.h
@@ -18,7 +18,7 @@
#include "vpx_scale/yv12config.h"
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
- int lumamask, double *weight);
+ double *weight);
double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *ssim_y, double *ssim_u, double *ssim_v);
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index bf949c4..7545d87 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -19,12 +19,12 @@
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
int layer;
int layer_end;
- int alt_ref_idx = svc->number_spatial_layers;
+ int alt_ref_idx = svc->number_spatial_layers * svc->number_temporal_layers;
svc->spatial_layer_id = 0;
svc->temporal_layer_id = 0;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
} else {
layer_end = svc->number_spatial_layers;
@@ -36,6 +36,8 @@
int i;
lc->current_video_frame_in_layer = 0;
lc->layer_size = 0;
+ lc->frames_from_key_frame = 0;
+ lc->last_frame_type = FRAME_TYPES;
lrc->ni_av_qi = oxcf->worst_allowed_q;
lrc->total_actual_bits = 0;
lrc->total_target_vs_actual = 0;
@@ -50,7 +52,7 @@
lrc->rate_correction_factors[i] = 1.0;
}
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
@@ -69,13 +71,14 @@
lc->gold_ref_idx = -1;
}
- lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms),
- lc->target_bandwidth, 1000);
+ lrc->buffer_level = oxcf->starting_buffer_level_ms *
+ lc->target_bandwidth / 1000;
lrc->bits_off_target = lrc->buffer_level;
}
// Still have extra buffer for base layer golden frame
- if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES)
+ if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
+ && alt_ref_idx < REF_FRAMES)
svc->layer_context[0].gold_ref_idx = alt_ref_idx;
}
@@ -89,7 +92,7 @@
int layer_end;
float bitrate_alloc = 1.0;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
} else {
layer_end = svc->number_spatial_layers;
@@ -99,7 +102,7 @@
LAYER_CONTEXT *const lc = &svc->layer_context[layer];
RATE_CONTROL *const lrc = &lc->rc;
- if (svc->number_temporal_layers > 1) {
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
} else {
lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
@@ -115,10 +118,10 @@
lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size);
lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size);
// Update framerate-related quantities.
- if (svc->number_temporal_layers > 1) {
- lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+ lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
} else {
- lc->framerate = oxcf->framerate;
+ lc->framerate = cpi->framerate;
}
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
@@ -128,20 +131,20 @@
}
}
-static LAYER_CONTEXT *get_layer_context(SVC *svc) {
- return svc->number_temporal_layers > 1 ?
- &svc->layer_context[svc->temporal_layer_id] :
- &svc->layer_context[svc->spatial_layer_id];
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+ return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
}
void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
SVC *const svc = &cpi->svc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
const int layer = svc->temporal_layer_id;
- lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer];
+ lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
// Update the average layer frame size (non-cumulative per-frame-bw).
@@ -149,7 +152,7 @@
lc->avg_frame_size = lrc->avg_frame_bandwidth;
} else {
const double prev_layer_framerate =
- oxcf->framerate / oxcf->ts_rate_decimator[layer - 1];
+ cpi->framerate / oxcf->ts_rate_decimator[layer - 1];
const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1];
lc->avg_frame_size =
(int)((lc->target_bandwidth - prev_layer_target_bandwidth) /
@@ -159,7 +162,7 @@
void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
RATE_CONTROL *const lrc = &lc->rc;
lc->framerate = framerate;
@@ -172,7 +175,7 @@
}
void vp9_restore_layer_context(VP9_COMP *const cpi) {
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
const int old_frame_since_key = cpi->rc.frames_since_key;
const int old_frame_to_key = cpi->rc.frames_to_key;
@@ -190,7 +193,7 @@
void vp9_save_layer_context(VP9_COMP *const cpi) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- LAYER_CONTEXT *const lc = get_layer_context(&cpi->svc);
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
lc->rc = cpi->rc;
lc->twopass = cpi->twopass;
@@ -214,15 +217,17 @@
svc->spatial_layer_id = 0;
}
-void vp9_inc_frame_in_layer(SVC *svc) {
- LAYER_CONTEXT *const lc = (svc->number_temporal_layers > 1)
- ? &svc->layer_context[svc->temporal_layer_id]
- : &svc->layer_context[svc->spatial_layer_id];
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+ LAYER_CONTEXT *const lc =
+ (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ?
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] :
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
++lc->current_video_frame_in_layer;
+ ++lc->frames_from_key_frame;
}
int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) {
- return is_spatial_svc(cpi) &&
+ return is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id > 0 &&
cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame;
}
@@ -257,6 +262,7 @@
int layer_id;
vpx_svc_parameters_t *layer_param;
LAYER_CONTEXT *lc;
+ int count = 1 << (cpi->svc.number_temporal_layers - 1);
// Find the next layer to be encoded
for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) {
@@ -274,17 +280,36 @@
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
- cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+ cpi->svc.temporal_layer_id = 0;
+ while ((lc->current_video_frame_in_layer % count) != 0) {
+ ++cpi->svc.temporal_layer_id;
+ count >>= 1;
+ }
- if (cpi->svc.spatial_layer_id < 1)
+ cpi->lst_fb_idx =
+ cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id;
+ if (lc->frames_from_key_frame < cpi->svc.number_temporal_layers)
+ cpi->ref_frame_flags &= ~VP9_LAST_FLAG;
+
+ if (cpi->svc.spatial_layer_id == 0) {
+ if (cpi->svc.temporal_layer_id == 0)
cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ?
lc->gold_ref_idx : cpi->lst_fb_idx;
- else
- cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1;
+ else
+ cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+ } else {
+ if (cpi->svc.temporal_layer_id == 0)
+ cpi->gld_fb_idx = cpi->svc.spatial_layer_id -
+ cpi->svc.number_temporal_layers;
+ else
+ cpi->gld_fb_idx = cpi->lst_fb_idx - 1;
+ }
if (lc->current_video_frame_in_layer == 0) {
if (cpi->svc.spatial_layer_id >= 2) {
- cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+ cpi->alt_fb_idx =
+ cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
} else {
cpi->alt_fb_idx = cpi->lst_fb_idx;
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
@@ -306,7 +331,8 @@
lc_lower->alt_ref_source != NULL)
cpi->alt_fb_idx = lc_lower->alt_ref_idx;
else if (cpi->svc.spatial_layer_id >= 2)
- cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2;
+ cpi->alt_fb_idx =
+ cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers;
else
cpi->alt_fb_idx = cpi->lst_fb_idx;
}
@@ -325,7 +351,7 @@
vp9_set_high_precision_mv(cpi, 1);
- cpi->alt_ref_source = get_layer_context(&cpi->svc)->alt_ref_source;
+ cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
return 0;
}
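
The temporal-layer selection added above derives the layer id from the frame
count alone. Extracted into a stand-alone sketch of the same loop:

/* With L temporal layers, frames divisible by 2^(L-1) belong to layer 0,
 * remaining frames divisible by 2^(L-2) to layer 1, and so on down to the
 * odd frames in layer L-1. */
static int temporal_layer_of_frame(unsigned int frame, int num_layers) {
  int count = 1 << (num_layers - 1);
  int tl = 0;
  while (frame % count != 0) {
    ++tl;
    count >>= 1;
  }
  return tl;
}
/* For L = 3: frames 0,4,8,... -> layer 0; 2,6,10,... -> layer 1;
 * 1,3,5,... -> layer 2. */
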
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 801449b..d180d1a 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -25,15 +25,18 @@
double framerate;
int avg_frame_size;
TWO_PASS twopass;
- struct vpx_fixed_buf rc_twopass_stats_in;
+ vpx_fixed_buf_t rc_twopass_stats_in;
unsigned int current_video_frame_in_layer;
int is_key_frame;
+ int frames_from_key_frame;
+ FRAME_TYPE last_frame_type;
vpx_svc_parameters_t svc_params_received;
struct lookahead_entry *alt_ref_source;
int alt_ref_idx;
int gold_ref_idx;
int has_alt_frame;
size_t layer_size;
+ struct vpx_psnr_pkt psnr_pkt;
} LAYER_CONTEXT;
typedef struct {
@@ -80,7 +83,7 @@
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
// Increment number of video frames in layer
-void vp9_inc_frame_in_layer(SVC *svc);
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
// Check if current layer is key frame in spatial upper layer
int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ce3b311..18a6a91 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -145,6 +145,7 @@
int bestsme = INT_MAX;
int distortion;
unsigned int sse;
+ int sad_list[5];
MV best_ref_mv1 = {0, 0};
MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
@@ -168,6 +169,7 @@
// Ignore mv costing by sending NULL pointer instead of cost arrays
vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
+ cond_sad_list(cpi, sad_list),
&cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
// Ignore mv costing by sending NULL pointer instead of cost array
@@ -177,6 +179,7 @@
x->errorperbit,
&cpi->fn_ptr[BLOCK_16X16],
0, mv_sf->subpel_iters_per_step,
+ cond_sad_list(cpi, sad_list),
NULL, NULL,
&distortion, &sse, NULL, 0, 0);
@@ -188,6 +191,7 @@
}
static void temporal_filter_iterate_c(VP9_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
int frame_count,
int alt_ref_index,
int strength,
@@ -203,7 +207,7 @@
DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
- YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
@@ -247,7 +251,7 @@
const int thresh_low = 10000;
const int thresh_high = 20000;
- if (cpi->frames[frame] == NULL)
+ if (frames[frame] == NULL)
continue;
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
@@ -258,9 +262,9 @@
} else {
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(cpi,
- cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->y_stride);
+ frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->y_stride);
// Assign higher weight to matching MB if its error
// score is lower. If not applying MC default behavior
@@ -272,10 +276,10 @@
if (filter_weight != 0) {
// Construct the predictors
temporal_filter_predictors_mb_c(mbd,
- cpi->frames[frame]->y_buffer + mb_y_offset,
- cpi->frames[frame]->u_buffer + mb_uv_offset,
- cpi->frames[frame]->v_buffer + mb_uv_offset,
- cpi->frames[frame]->y_stride,
+ frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset,
+ frames[frame]->y_stride,
mb_uv_width, mb_uv_height,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
@@ -357,12 +361,14 @@
// Apply buffer limits and context specific adjustments to arnr filter.
static void adjust_arnr_filter(VP9_COMP *cpi,
- int distance, int group_boost) {
+ int distance, int group_boost,
+ int *arnr_frames, int *arnr_strength) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
const int frames_after_arf =
- vp9_lookahead_depth(cpi->lookahead) - distance - 1;
+ vp9_lookahead_depth(cpi->lookahead) - distance - 1;
int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
int frames_bwd;
- int q;
+ int q, frames, strength;
// Define the forward and backward filter limits for this arnr group.
if (frames_fwd > frames_after_arf)
@@ -375,10 +381,10 @@
// For even length filter there is one more frame backward
// than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
if (frames_bwd < distance)
- frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1;
+ frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
// Set the baseline active filter size.
- cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+ frames = frames_bwd + 1 + frames_fwd;
// Adjust the strength based on active max q.
if (cpi->common.current_video_frame > 1)
@@ -388,29 +394,33 @@
q = ((int)vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[KEY_FRAME]));
if (q > 16) {
- cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
+ strength = oxcf->arnr_strength;
} else {
- cpi->active_arnr_strength = cpi->oxcf.arnr_strength - ((16 - q) / 2);
- if (cpi->active_arnr_strength < 0)
- cpi->active_arnr_strength = 0;
+ strength = oxcf->arnr_strength - ((16 - q) / 2);
+ if (strength < 0)
+ strength = 0;
}
// Adjust number of frames in filter and strength based on gf boost level.
- if (cpi->active_arnr_frames > (group_boost / 150)) {
- cpi->active_arnr_frames = (group_boost / 150);
- cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);
+ if (frames > group_boost / 150) {
+ frames = group_boost / 150;
+ frames += !(frames & 1);
}
- if (cpi->active_arnr_strength > (group_boost / 300)) {
- cpi->active_arnr_strength = (group_boost / 300);
+
+ if (strength > group_boost / 300) {
+ strength = group_boost / 300;
}
// Adjustments for second level arf in multi arf case.
if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) {
- cpi->active_arnr_strength >>= 1;
+ strength >>= 1;
}
}
+
+ *arnr_frames = frames;
+ *arnr_strength = strength;
}
void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
@@ -423,26 +433,24 @@
int frames_to_blur_backward;
int frames_to_blur_forward;
struct scale_factors sf;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL};
// Apply context specific adjustments to the arnr filter parameters.
- adjust_arnr_filter(cpi, distance, rc->gfu_boost);
- strength = cpi->active_arnr_strength;
- frames_to_blur = cpi->active_arnr_frames;
+ adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
frames_to_blur_backward = (frames_to_blur / 2);
frames_to_blur_forward = ((frames_to_blur - 1) / 2);
start_frame = distance + frames_to_blur_forward;
// Set up frame pointers; NULL indicates a frame is not included in the filter.
- vp9_zero(cpi->frames);
for (frame = 0; frame < frames_to_blur; ++frame) {
const int which_buffer = start_frame - frame;
struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead,
which_buffer);
- cpi->frames[frames_to_blur - 1 - frame] = &buf->img;
+ frames[frames_to_blur - 1 - frame] = &buf->img;
}
// Set up scaling factors. Scaling on each of the arnr frames is not supported.
- if (is_spatial_svc(cpi)) {
+ if (is_two_pass_svc(cpi)) {
// In spatial svc the scaling factors might be less than 1/2. So we will use
// non-normative scaling.
int frame_used = 0;
@@ -453,19 +461,21 @@
get_frame_new_buffer(cm)->y_crop_height);
for (frame = 0; frame < frames_to_blur; ++frame) {
- if (cm->mi_cols * MI_SIZE != cpi->frames[frame]->y_width ||
- cm->mi_rows * MI_SIZE != cpi->frames[frame]->y_height) {
+ if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+ cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
- cpi->frames[frame] =
- vp9_scale_if_required(cm, cpi->frames[frame],
- &cpi->svc.scaled_frames[frame_used]);
+ frames[frame] = vp9_scale_if_required(cm, frames[frame],
+ &cpi->svc.scaled_frames[frame_used]);
++frame_used;
}
}
@@ -476,6 +486,6 @@
cm->width, cm->height);
}
- temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength, &sf);
+ temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+ frames_to_blur_backward, strength, &sf);
}
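
The frames/strength out-parameters of adjust_arnr_filter() feed the window
split at the top of vp9_temporal_filter(); the split itself is simple enough
to state directly, pulled out here as a sketch:

/* For an odd window the ARF sits in the middle; for an even window there is
 * one extra frame on the backward side (len=6 -> bbbAff, len=7 -> bbbAfff). */
static void split_filter_window(int frames_to_blur,
                                int *backward, int *forward) {
  *backward = frames_to_blur / 2;
  *forward = (frames_to_blur - 1) / 2;
}
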
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 6068b85..263883f 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -212,7 +212,7 @@
TOKENEXTRA *t = *tp; /* store tokens starting here */
int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
- const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
const scan_order *so;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index eb5ae2e..c97f93f 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -103,8 +103,9 @@
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
- for (i = 0; i < 256; i++)
+ for (i = 0; i < 256; ++i) {
sum += src_ptr[i] * src_ptr[i];
+ }
return sum;
}
@@ -266,3 +267,375 @@
ref += ref_stride;
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, uint64_t *sse,
+ uint64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+void high_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = sse_long;
+ *sum = sum_long;
+}
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static void high_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void high_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+ src_ptr++;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGH_VAR(W, H) \
+unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGH_SUBPIX_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+}
+
+#define HIGH_SUBPIX_AVG_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+}
+
+#define HIGH_GET_VAR(S) \
+void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGH_MSE(W, H) \
+unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+}
+
+HIGH_GET_VAR(8)
+HIGH_GET_VAR(16)
+
+HIGH_MSE(16, 16)
+HIGH_MSE(16, 8)
+HIGH_MSE(8, 16)
+HIGH_MSE(8, 8)
+
+HIGH_VAR(4, 4)
+HIGH_SUBPIX_VAR(4, 4)
+HIGH_SUBPIX_AVG_VAR(4, 4)
+
+HIGH_VAR(4, 8)
+HIGH_SUBPIX_VAR(4, 8)
+HIGH_SUBPIX_AVG_VAR(4, 8)
+
+HIGH_VAR(8, 4)
+HIGH_SUBPIX_VAR(8, 4)
+HIGH_SUBPIX_AVG_VAR(8, 4)
+
+HIGH_VAR(8, 8)
+HIGH_SUBPIX_VAR(8, 8)
+HIGH_SUBPIX_AVG_VAR(8, 8)
+
+HIGH_VAR(8, 16)
+HIGH_SUBPIX_VAR(8, 16)
+HIGH_SUBPIX_AVG_VAR(8, 16)
+
+HIGH_VAR(16, 8)
+HIGH_SUBPIX_VAR(16, 8)
+HIGH_SUBPIX_AVG_VAR(16, 8)
+
+HIGH_VAR(16, 16)
+HIGH_SUBPIX_VAR(16, 16)
+HIGH_SUBPIX_AVG_VAR(16, 16)
+
+HIGH_VAR(16, 32)
+HIGH_SUBPIX_VAR(16, 32)
+HIGH_SUBPIX_AVG_VAR(16, 32)
+
+HIGH_VAR(32, 16)
+HIGH_SUBPIX_VAR(32, 16)
+HIGH_SUBPIX_AVG_VAR(32, 16)
+
+HIGH_VAR(32, 32)
+HIGH_SUBPIX_VAR(32, 32)
+HIGH_SUBPIX_AVG_VAR(32, 32)
+
+HIGH_VAR(32, 64)
+HIGH_SUBPIX_VAR(32, 64)
+HIGH_SUBPIX_AVG_VAR(32, 64)
+
+HIGH_VAR(64, 32)
+HIGH_SUBPIX_VAR(64, 32)
+HIGH_SUBPIX_AVG_VAR(64, 32)
+
+HIGH_VAR(64, 64)
+HIGH_SUBPIX_VAR(64, 64)
+HIGH_SUBPIX_AVG_VAR(64, 64)
+
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
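
The 10- and 12-bit variance variants above rescale the raw accumulators back
to the 8-bit range before applying var = sse - sum^2/N: sse accumulates
squared differences, so it is shifted down by twice as many bits as sum
(4 vs 2 for 10-bit, 8 vs 4 for 12-bit). A sketch of the arithmetic, with
round_pot standing in for ROUND_POWER_OF_TWO and the sum rounding simplified
to a plain shift:

#include <stdint.h>

static uint64_t round_pot(uint64_t value, int n) {
  return (value + (1ULL << (n - 1))) >> n;
}

/* extra_bits is 2 for 10-bit input, 4 for 12-bit input. */
static unsigned int variance_rescaled(uint64_t sse, int64_t sum,
                                      int n_pixels, int extra_bits) {
  const uint64_t sse8 = round_pot(sse, 2 * extra_bits);
  const int64_t sum8 = sum >> extra_bits;  /* the real code rounds here too */
  return (unsigned int)(sse8 - (uint64_t)(sum8 * sum8 / n_pixels));
}
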
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4a194b7..c51d08d 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -22,6 +22,23 @@
int w, int h,
unsigned int *sse, int *sum);
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+#endif
+
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -81,6 +98,11 @@
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride);
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
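
vp9_high_comp_avg_pred(), declared above and defined in vp9_variance.c,
averages the predictor with the second prediction using the usual rounded
mean; per sample it reduces to:

static uint16_t avg_round(uint16_t a, uint16_t b) {
  return (uint16_t)((a + b + 1) >> 1);  /* ROUND_POWER_OF_TWO(a + b, 1) */
}
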
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 487deef..e799951 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,6 +12,8 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
@@ -780,58 +782,6 @@
_mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
void fdct8_sse2(__m128i *in) {
// constants
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
@@ -1953,23 +1903,6 @@
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
@@ -2891,11 +2824,11 @@
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
+#undef FDCT32x32_2D
#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
-#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
+#undef FDCT32x32_2D
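
The reordered #undefs above restore reverse-of-definition order for the
include-as-template trick: one .c body is compiled twice under different
macro settings, and each configuration must be torn down cleanly before the
next begins. A hedged sketch of the pattern; the file name
"fdct32x32_impl.inc" and the macro names are illustrative only:

#define FDCT_NAME fdct32x32_rd
#define FDCT_HIGH_PRECISION 0
#include "fdct32x32_impl.inc"  /* emits FDCT_NAME() specialized by the macros */
#undef FDCT_HIGH_PRECISION
#undef FDCT_NAME

#define FDCT_NAME fdct32x32
#define FDCT_HIGH_PRECISION 1
#include "fdct32x32_impl.inc"
#undef FDCT_HIGH_PRECISION
#undef FDCT_NAME
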
diff --git a/vp9/encoder/x86/vp9_sad_mmx.asm b/vp9/encoder/x86/vp9_sad_mmx.asm
deleted file mode 100644
index 32fdd23..0000000
--- a/vp9/encoder/x86/vp9_sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vp9_sad16x16_mmx) PRIVATE
-global sym(vp9_sad8x16_mmx) PRIVATE
-global sym(vp9_sad8x8_mmx) PRIVATE
-global sym(vp9_sad4x4_mmx) PRIVATE
-global sym(vp9_sad16x8_mmx) PRIVATE
-
-;unsigned int vp9_sad16x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpcklbw mm2, mm6
-
- punpckhbw mm1, mm6
- punpckhbw mm3, mm6
-
- paddw mm0, mm2
- paddw mm1, mm3
-
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- paddw mm7, mm1
-
- cmp rsi, rcx
- jne .x16x16sad_mmx_loop
-
-
- movq mm0, mm7
-
- punpcklwd mm0, mm6
- punpckhwd mm7, mm6
-
- paddw mm0, mm7
- movq mm7, mm0
-
-
- psrlq mm0, 32
- paddw mm7, mm0
-
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- paddw mm7, mm2
- cmp rsi, rcx
-
- jne .x8x16sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad8x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x8sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- paddw mm0, mm2
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- cmp rsi, rcx
-
- jne .x8x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad4x4_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- pxor mm3, mm3
-
- punpcklbw mm0, mm3
- punpckhbw mm2, mm3
-
- paddw mm0, mm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movd mm4, DWORD PTR [rsi]
- movd mm5, DWORD PTR [rdi]
-
- movd mm6, DWORD PTR [rsi+rax]
- movd mm7, DWORD PTR [rdi+rdx]
-
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
-
- movq mm6, mm4
- psubusb mm4, mm5
-
- psubusb mm5, mm6
- por mm4, mm5
-
- movq mm5, mm4
- punpcklbw mm4, mm3
-
- punpckhbw mm5, mm3
- paddw mm4, mm5
-
- paddw mm0, mm4
- movq mm1, mm0
-
- punpcklwd mm0, mm3
- punpckhwd mm1, mm3
-
- paddw mm0, mm1
- movq mm1, mm0
-
- psrlq mm0, 32
- paddw mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_sad16x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vp9_sad16x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x8sad_mmx_loop:
-
- movq mm0, [rsi]
- movq mm1, [rdi]
-
- movq mm2, [rsi+8]
- movq mm3, [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpckhbw mm1, mm6
-
- punpcklbw mm2, mm6
- punpckhbw mm3, mm6
-
-
- paddw mm0, mm2
- paddw mm1, mm3
-
- paddw mm0, mm1
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x16x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
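The deleted MMX routines all share one trick: for unsigned bytes, |a - b| equals (a -sat b) | (b -sat a), computed with two psubusb and a por, after which the bytes are widened and accumulated. A compact SSE2 sketch of the same absolute-difference step (illustrative only; the SAD paths that remain in the tree are separate SSE2/AVX2 kernels):

#include <emmintrin.h>
#include <stdint.h>

/* SAD over one 16-byte row using the saturating-subtract trick. */
static unsigned int sad16_row_sse2(const uint8_t *src, const uint8_t *ref) {
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
  /* |s - r| per byte: saturating subtract in both directions, then OR. */
  const __m128i diff = _mm_or_si128(_mm_subs_epu8(s, r), _mm_subs_epu8(r, s));
  /* psadbw reduces each 8-byte half to one 16-bit sum; note that
   * _mm_sad_epu8(s, r) would do the subtract and reduce in one step. */
  const __m128i sad = _mm_sad_epu8(diff, _mm_setzero_si128());
  return _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);
}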
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
index 9aa4da9..a441cad 100644
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -333,7 +333,7 @@
if (y_offset == 0) {
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
@@ -347,7 +347,7 @@
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
@@ -369,7 +369,7 @@
MERGE_NEXT_SRC(src_reg, src_stride)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_reg, zero_reg)
@@ -385,7 +385,7 @@
for (i = 0; i < height ; i++) {
LOAD_SRC_DST
AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
@@ -409,7 +409,7 @@
AVG_NEXT_SRC(src_reg, 1)
// average the previous average with the current one
src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
sec+= sec_stride;
// expand each byte to 2 bytes
@@ -437,7 +437,7 @@
MERGE_WITH_SRC(src_avg, src_reg)
FILTER_SRC(filter)
src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_avg = _mm256_avg_epu8(src_avg, sec_reg);
// expand each byte to 2 bytes
MERGE_WITH_SRC(src_avg, zero_reg)
@@ -459,7 +459,7 @@
MERGE_NEXT_SRC(src_reg, 1)
FILTER_SRC(filter)
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_reg = _mm256_avg_epu8(src_reg, sec_reg);
MERGE_WITH_SRC(src_reg, zero_reg)
sec+= sec_stride;
@@ -487,7 +487,7 @@
src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
// average the previous pack with the current one
src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
sec+= sec_stride;
MERGE_WITH_SRC(src_pack, zero_reg)
@@ -524,7 +524,7 @@
// filter the source
FILTER_SRC(yfilter)
src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_load_si256((__m256i const *) (sec));
+ sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
src_pack = _mm256_avg_epu8(src_pack, sec_reg);
MERGE_WITH_SRC(src_pack, zero_reg)
src_pack = src_reg;
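Every load -> loadu change in this hunk is the same fix: sec advances by a caller-supplied sec_stride each row, so 32-byte alignment cannot be guaranteed, and _mm256_load_si256 (vmovdqa) faults on a misaligned address while _mm256_loadu_si256 (vmovdqu) accepts any address, typically at no extra cost when the pointer happens to be aligned. A sketch of the distinction:

#include <immintrin.h>
#include <stdint.h>

/* Illustrative only: take the aligned form only when provably aligned. */
static __m256i load_256(const uint8_t *p) {
  if (((uintptr_t)p & 31) == 0)
    return _mm256_load_si256((const __m256i *)p);   /* requires 32B alignment */
  return _mm256_loadu_si256((const __m256i *)p);    /* any address */
}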
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index 7f81f46..ea09b95 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -12,67 +12,39 @@
#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"
-typedef void (*get_var_avx2) (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
-void vp9_get16x16var_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
-void vp9_get32x32var_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *SSE,
- int *Sum
-);
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum);
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
- const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- int height,
- unsigned int *sse
-);
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height,
+ unsigned int *sse);
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
- const uint8_t *src,
- int src_stride,
- int x_offset,
- int y_offset,
- const uint8_t *dst,
- int dst_stride,
- const uint8_t *sec,
- int sec_stride,
- int height,
- unsigned int *sseptr
-);
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+ int src_stride,
+ int x_offset,
+ int y_offset,
+ const uint8_t *dst,
+ int dst_stride,
+ const uint8_t *sec,
+ int sec_stride,
+ int height,
+ unsigned int *sseptr);
-static void variance_avx2(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- int w, int h, unsigned int *sse, int *sum,
- get_var_avx2 var_fn, int block_size) {
- unsigned int sse0;
- int sum0;
+static void variance_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ get_var_avx2 var_fn, int block_size) {
int i, j;
*sse = 0;
@@ -80,105 +52,68 @@
for (i = 0; i < h; i += 16) {
for (j = 0; j < w; j += block_size) {
- // processing 16 rows horizontally each call
- var_fn(src_ptr + source_stride * i + j, source_stride,
- ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+ unsigned int sse0;
+ int sum0;
+ var_fn(&src[src_stride * i + j], src_stride,
+ &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
*sse += sse0;
*sum += sum0;
}
}
}
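The rewritten helper tiles the w x h block in 16-row strips and block_size-wide columns, matching the kernels, which (per the deleted comment) each cover 16 rows per call: a 64x64 block with block_size 32 therefore makes 4 x 2 = 8 kernel calls. A scalar stand-in with the signature of vp9_get16x16var_avx2 (block_size 16), useful only for cross-checking the tiling (a sketch, not code from the tree):

#include <stdint.h>

static void get_16x16var_c(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += d * d;
    }
  }
}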
-unsigned int vp9_variance16x16_avx2
-(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
- &var, &avg, vp9_get16x16var_avx2, 16);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp9_mse16x16_avx2(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int sse0;
- int sum0;
- vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
- &sum0);
- *sse = sse0;
- return sse0;
-}
-
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 10));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_get16x16var_avx2, 16);
+ return *sse - (((unsigned int)sum * sum) >> 8);
}
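All of these wrappers apply the same identity: over N pixel differences d_i, sum((d_i - mean)^2) = SSE - Sum^2 / N, and N is always a power of two, so the division is a shift (>> 8 for 16x16 = 256 pixels, >> 9 for 32x16, >> 10 for 32x32, >> 12 for 64x64). The casts track overflow: for 16x16, |Sum| <= 255 * 256 = 65280 and Sum^2 <= 4261478400, which still fits in 32 unsigned bits, while any larger block needs the int64_t product. The common form, factored out as a sketch:

#include <stdint.h>

/* variance = SSE - Sum^2 / N, with N = 1 << log2_n pixels. */
static unsigned int variance_from_moments(unsigned int sse, int sum,
                                          int log2_n) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_n);
}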
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 9));
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+ return *sse;
}
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
-
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
- &var, &avg, vp9_get32x32var_avx2, 32);
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 12));
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 9);
}
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
- int source_stride,
- const uint8_t *ref_ptr,
- int recon_stride,
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
unsigned int *sse) {
- unsigned int var;
- int avg;
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 10);
+}
- // processing 32 elements vertically in parallel
- variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
- &var, &avg, vp9_get32x32var_avx2, 32);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 12);
+}
- *sse = var;
- return (var - (((int64_t)avg * avg) >> 11));
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+ sse, &sum, vp9_get32x32var_avx2, 32);
+ return *sse - (((int64_t)sum * sum) >> 11);
}
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
@@ -187,22 +122,19 @@
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sse_ptr) {
- // processing 32 elements in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 64, &sse);
- // processing the next 32 elements in parallel
+ unsigned int *sse) {
+ unsigned int sse1;
+ const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 64, &sse1);
unsigned int sse2;
- int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
- x_offset, y_offset,
- dst + 32, dst_stride,
- 64, &sse2);
- se += se2;
- sse += sse2;
- *sse_ptr = sse;
- return sse - (((int64_t)se * se) >> 12);
+ const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+ x_offset, y_offset,
+ dst + 32, dst_stride,
+ 64, &sse2);
+ const int se = se1 + se2;
+ *sse = sse1 + sse2;
+ return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -211,14 +143,11 @@
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- 32, &sse);
- *sse_ptr = sse;
- return sse - (((int64_t)se * se) >> 10);
+ unsigned int *sse) {
+ const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
}
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
@@ -227,24 +156,22 @@
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sseptr,
+ unsigned int *sse,
const uint8_t *sec) {
- // processing 32 elements in parallel
- unsigned int sse;
-
- int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 64, 64, &sse);
+ unsigned int sse1;
+ const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 64, 64, &sse1);
unsigned int sse2;
- // processing the next 32 elements in parallel
- int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
- y_offset, dst + 32, dst_stride,
- sec + 32, 64, 64, &sse2);
- se += se2;
- sse += sse2;
- *sseptr = sse;
+ const int se2 =
+ vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+ y_offset, dst + 32, dst_stride,
+ sec + 32, 64, 64, &sse2);
+ const int se = se1 + se2;
- return sse - (((int64_t)se * se) >> 12);
+ *sse = sse1 + sse2;
+
+ return *sse - (((int64_t)se * se) >> 12);
}
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
@@ -253,15 +180,11 @@
int y_offset,
const uint8_t *dst,
int dst_stride,
- unsigned int *sseptr,
+ unsigned int *sse,
const uint8_t *sec) {
// processing 32 elements in parallel
- unsigned int sse;
- int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
- y_offset, dst, dst_stride,
- sec, 32, 32, &sse);
- *sseptr = sse;
- return sse - (((int64_t)se * se) >> 10);
+ const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+ y_offset, dst, dst_stride,
+ sec, 32, 32, sse);
+ return *sse - (((int64_t)se * se) >> 10);
}
-
-
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
deleted file mode 100644
index 3501cf1..0000000
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ /dev/null
@@ -1,510 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
-global sym(vp9_get_mb_ss_mmx) PRIVATE
-sym(vp9_get_mb_ss_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 8
- ; end prolog
-
- mov rax, arg(0) ;src_ptr
- mov rcx, 16
- pxor mm4, mm4
-
-.NEXTROW:
- movq mm0, [rax]
- movq mm1, [rax+8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
-
- paddd mm4, mm0
- paddd mm4, mm1
- paddd mm4, mm2
- paddd mm4, mm3
-
- add rax, 32
- dec rcx
- ja .NEXTROW
- movq QWORD PTR [rsp], mm4
-
- ;return sum[0]+sum[1];
- movsxd rax, dword ptr [rsp]
- movsxd rcx, dword ptr [rsp+4]
- add rax, rcx
-
-
- ; begin epilog
- add rsp, 8
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get8x8var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get8x8var_mmx) PRIVATE
-sym(vp9_get8x8var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Blank mm5
- pxor mm6, mm6 ; Blank mm6
- pxor mm7, mm7 ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 5
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- ; movq mm4, [rbx + rdx]
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 6
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 7
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 8
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher precision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vp9_get4x4var_mmx) PRIVATE
-sym(vp9_get4x4var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
-
- pxor mm5, mm5 ; Blank mm5
- pxor mm6, mm6 ; Blank mm6
- pxor mm7, mm7 ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movd mm0, [rax] ; Copy 4 bytes to mm0
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Row 2
- movd mm0, [rax] ; Copy 4 bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy 4 bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy 4 bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy 4 bytes to mm0
-
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;unsigned int
-;vp9_get4x4sse_cs_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride
-;)
-global sym(vp9_get4x4sse_cs_mmx) PRIVATE
-sym(vp9_get4x4sse_cs_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- push rbx
- ; end prolog
-
-
- pxor mm6, mm6 ; Blank mm6
- pxor mm7, mm7 ; Blank mm7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
- ; Row 1
- movd mm0, [rax] ; Copy four bytes to mm0
- movd mm1, [rbx] ; Copy four bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 2
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm1, mm6
- punpcklbw mm0, mm6 ; unpack to higher precision
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
- movq mm0, mm7 ;
- psrlq mm7, 32
-
- paddd mm0, mm7
- movq rax, mm0
-
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
deleted file mode 100644
index 4830412..0000000
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ /dev/null
@@ -1,401 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp9_get_mb_ss_sse2
-;(
-; short *src_ptr
-;)
-global sym(vp9_get_mb_ss_sse2) PRIVATE
-sym(vp9_get_mb_ss_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 1
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- mov rax, arg(0) ;[src_ptr]
- mov rcx, 8
- pxor xmm4, xmm4
-
-.NEXTROW:
- movdqa xmm0, [rax]
- movdqa xmm1, [rax+16]
- movdqa xmm2, [rax+32]
- movdqa xmm3, [rax+48]
- pmaddwd xmm0, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
-
- paddd xmm0, xmm1
- paddd xmm2, xmm3
- paddd xmm4, xmm0
- paddd xmm4, xmm2
-
- add rax, 0x40
- dec rcx
- ja .NEXTROW
-
- movdqa xmm3,xmm4
- psrldq xmm4,8
- paddd xmm4,xmm3
- movdqa xmm3,xmm4
- psrldq xmm4,4
- paddd xmm4,xmm3
- movq rax,xmm4
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp9_get16x16var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get16x16var_sse2) PRIVATE
-sym(vp9_get16x16var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- ; Prefetch data
- lea rcx, [rax+rax*2]
- prefetcht0 [rsi]
- prefetcht0 [rsi+rax]
- prefetcht0 [rsi+rax*2]
- prefetcht0 [rsi+rcx]
- lea rbx, [rsi+rax*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax*2]
- prefetcht0 [rbx+rcx]
-
- lea rcx, [rdx+rdx*2]
- prefetcht0 [rdi]
- prefetcht0 [rdi+rdx]
- prefetcht0 [rdi+rdx*2]
- prefetcht0 [rdi+rcx]
- lea rbx, [rdi+rdx*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx*2]
- prefetcht0 [rbx+rcx]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-.var16loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- prefetcht0 [rsi+rax*8]
- prefetcht0 [rdi+rdx*8]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz .var16loop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movd DWORD PTR [rax], xmm7
- movd DWORD PTR [rdi], xmm1
-
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;unsigned int vp9_get8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(vp9_get8x8var_sse2) PRIVATE
-sym(vp9_get8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- movq xmm1, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rdi]
-
- punpcklbw xmm1, xmm0
- punpcklbw xmm2, xmm0
-
- psubsw xmm1, xmm2
- paddw xmm7, xmm1
-
- pmaddwd xmm1, xmm1
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movq xmm2, QWORD PTR[rsi + rax * 2]
- movq xmm3, QWORD PTR[rdi + rdx * 2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
- movq xmm2, QWORD PTR[rsi + rax *2]
- movq xmm3, QWORD PTR[rdi + rdx *2]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- lea rsi, [rsi + rax * 2]
- lea rdi, [rdi + rdx * 2]
-
- movq xmm2, QWORD PTR[rsi + rax]
- movq xmm3, QWORD PTR[rdi + rdx]
-
- punpcklbw xmm2, xmm0
- punpcklbw xmm3, xmm0
-
- psubsw xmm2, xmm3
- paddw xmm7, xmm2
-
- pmaddwd xmm2, xmm2
- paddd xmm1, xmm2
-
-
- movdqa xmm6, xmm7
- punpcklwd xmm6, xmm0
-
- punpckhwd xmm7, xmm0
- movdqa xmm2, xmm1
-
- paddw xmm6, xmm7
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddw xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddw xmm7, xmm6
- paddd xmm1, xmm2
-
- mov rax, arg(5) ;[Sum]
- mov rdi, arg(4) ;[SSE]
-
- movq rdx, xmm7
- movsx rcx, dx
-
- mov dword ptr [rax], ecx
- movd DWORD PTR [rdi], xmm1
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
deleted file mode 100644
index ce1c832..0000000
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse, int *sum);
-
-unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *SSE, int *sum);
-
-unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3;
- int sum0, sum1, sum2, sum3;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
- ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
- *sse = sse0 + sse1 + sse2 + sse3;
- return *sse;
-}
-
-
-unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3;
- int sum0, sum1, sum2, sum3, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
- vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
- ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
-
- *sse = sse0 + sse1 + sse2 + sse3;
- sum = sum0 + sum1 + sum2 + sum3;
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1;
- int sum0, sum1, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
-
- *sse = sse0 + sse1;
- sum = sum0 + sum1;
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-
-unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1;
- int sum0, sum1, sum;
-
- vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
- vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
- ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
-
- *sse = sse0 + sse1;
- sum = sum0 + sum1;
- return *sse - (((unsigned int)sum * sum) >> 7);
-}
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index e935a23..b4d2b0a 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
+
#include "./vpx_config.h"
#include "vp9/encoder/vp9_variance.h"
@@ -17,18 +19,137 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse, int *sum);
-unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = _mm_loadu_si128((const __m128i *)src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return _mm_cvtsi128_si32(vsum);
+}
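The loop consumes 32 loads of 8 int16 values, i.e. 256 coefficients, one 16x16 macroblock, and the two fold-and-add steps then reduce the four 32-bit partial sums to one. A scalar equivalent for reference (a sketch; behavior only):

#include <stdint.h>

static unsigned int get_mb_ss_ref(const int16_t *src) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i)
    ss += src[i] * src[i];
  return ss;
}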
-unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse, int *sum);
+#define READ64(p, stride, i) \
+ _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+ _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
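READ64 packs two 4-pixel rows into the low 64 bits of one register. Spelled out step by step (illustrative only):

#include <emmintrin.h>
#include <stdint.h>

/* What READ64(p, stride, 0) evaluates to. */
static __m128i read64_steps(const uint8_t *p, int stride) {
  const __m128i row0 = _mm_cvtsi32_si128(*(const uint32_t *)(p));
  const __m128i row1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + stride));
  /* Bytes interleave as r0[0], r1[0], r0[1], r1[1], ...; the order is
   * irrelevant here because sum and sse are order-independent reductions. */
  return _mm_unpacklo_epi8(row0, row1);
}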
+unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+ const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ // sum
+ __m128i vsum = _mm_add_epi16(diff0, diff1);
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+ _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ *sse = _mm_cvtsi128_si32(vsum);
+
+ return 0;
+}
+
+unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 8; i += 2) {
+ const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + i * src_stride)), zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + i * ref_stride)), zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(src + (i + 1) * src_stride)), zero);
+ const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+ (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
+
+unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int *sum) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+ const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+ vsum = _mm_add_epi16(vsum, diff0);
+ vsum = _mm_add_epi16(vsum, diff1);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ // sum
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+ (int16_t)_mm_extract_epi16(vsum, 1);
+
+ // sse
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+ vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+ *sse = _mm_cvtsi128_si32(vsse);
+
+ return 0;
+}
+
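Each kernel ends with the same log2-step horizontal reduction: shift the register right by half the remaining width and add, folding 4 lanes to 2 to 1 (the 16-bit sum path additionally sign-extends lane 0 through the int16_t cast, since the difference sum can be negative). The 32-bit version in isolation:

#include <emmintrin.h>

/* Horizontal sum of four 32-bit lanes by successive fold-and-add. */
static int hsum_epi32(__m128i v) {
  v = _mm_add_epi32(v, _mm_srli_si128(v, 8));   /* {0+2, 1+3, -, -} */
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));   /* lane 0 holds the total */
  return _mm_cvtsi128_si32(v);
}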
static void variance_sse2(const unsigned char *src, int src_stride,
const unsigned char *ref, int ref_stride,
@@ -55,8 +176,7 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 4);
}
@@ -65,7 +185,7 @@
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ sse, &sum, vp9_get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
@@ -74,7 +194,7 @@
unsigned int *sse) {
int sum;
variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
- sse, &sum, vp9_get4x4var_mmx, 4);
+ sse, &sum, vp9_get4x4var_sse2, 4);
return *sse - (((unsigned int)sum * sum) >> 5);
}
@@ -82,8 +202,7 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
- sse, &sum, vp9_get8x8var_sse2, 8);
+ vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
return *sse - (((unsigned int)sum * sum) >> 6);
}
@@ -109,17 +228,8 @@
const unsigned char *ref, int ref_stride,
unsigned int *sse) {
int sum;
- variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
- sse, &sum, vp9_get16x16var_sse2, 16);
- return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride,
- const unsigned char *ref, int ref_stride,
- unsigned int *sse) {
- int sum;
vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
- return *sse;
+ return *sse - (((unsigned int)sum * sum) >> 8);
}
unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
@@ -176,6 +286,34 @@
return *sse - (((int64_t)sum * sum) >> 11);
}
+unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 8e3e885..90f0342 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -80,7 +80,6 @@
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index bf8eec7..0851d8a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -21,7 +21,6 @@
#include "vp9/vp9_iface_common.h"
struct vp9_extracfg {
- struct vpx_codec_pkt_list *pkt_list;
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
unsigned int noise_sensitivity;
@@ -31,7 +30,6 @@
unsigned int tile_rows;
unsigned int arnr_max_frames;
unsigned int arnr_strength;
- unsigned int arnr_type;
vp8e_tuning tuning;
unsigned int cq_level; // constrained quality level
unsigned int rc_max_intra_bitrate_pct;
@@ -39,41 +37,29 @@
unsigned int frame_parallel_decoding_mode;
AQ_MODE aq_mode;
unsigned int frame_periodic_boost;
- BIT_DEPTH bit_depth;
+ vpx_bit_depth_t bit_depth;
vp9e_tune_content content;
};
-struct extraconfig_map {
- unsigned int usage;
- struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
- {
- 0,
- { // NOLINT
- NULL,
- 0, // cpu_used
- 1, // enable_auto_alt_ref
- 0, // noise_sensitivity
- 0, // sharpness
- 0, // static_thresh
- 0, // tile_columns
- 0, // tile_rows
- 7, // arnr_max_frames
- 5, // arnr_strength
- 3, // arnr_type
- VP8_TUNE_PSNR, // tuning
- 10, // cq_level
- 0, // rc_max_intra_bitrate_pct
- 0, // lossless
- 0, // frame_parallel_decoding_mode
- NO_AQ, // aq_mode
- 0, // frame_periodic_delta_q
- BITS_8, // Bit depth
- VP9E_CONTENT_DEFAULT // content
- }
- }
+static struct vp9_extracfg default_extra_cfg = {
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+ 0, // tile_columns
+ 0, // tile_rows
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ VP8_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // lossless
+ 0, // frame_parallel_decoding_mode
+ NO_AQ, // aq_mode
+ 0, // frame_periodic_boost
+ VPX_BITS_8, // Bit depth
+ VP9E_CONTENT_DEFAULT // content
};
struct vpx_codec_alg_priv {
@@ -177,20 +163,8 @@
}
RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
-
-#if CONFIG_SPATIAL_SVC
- if (cfg->ss_number_layers > 1) {
- unsigned int i, alt_ref_sum = 0;
- for (i = 0; i < cfg->ss_number_layers; ++i) {
- if (cfg->ss_enable_auto_alt_ref[i])
- ++alt_ref_sum;
- }
- if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers)
- ERROR("Not enough ref buffers for svc alt ref frames");
- }
-#endif
-
RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS);
+
if (cfg->ts_number_layers > 1) {
unsigned int i;
for (i = 1; i < cfg->ts_number_layers; ++i)
@@ -203,6 +177,28 @@
ERROR("ts_rate_decimator factors are not powers of 2");
}
+#if CONFIG_SPATIAL_SVC
+ if (cfg->ss_number_layers * cfg->ts_number_layers > REF_FRAMES)
+ ERROR("Too many layers. Maximum 8 layers could be set");
+
+ if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) &&
+ cfg->g_pass == VPX_RC_LAST_PASS) {
+ unsigned int i, alt_ref_sum = 0;
+ for (i = 0; i < cfg->ss_number_layers; ++i) {
+ if (cfg->ss_enable_auto_alt_ref[i])
+ ++alt_ref_sum;
+ }
+ if (alt_ref_sum >
+ REF_FRAMES - cfg->ss_number_layers * cfg->ts_number_layers)
+ ERROR("Not enough ref buffers for svc alt ref frames");
+ if ((cfg->ss_number_layers > 3 ||
+ cfg->ss_number_layers * cfg->ts_number_layers > 4) &&
+ cfg->g_error_resilient == 0)
+ ERROR("Multiple frame context are not supported for more than 3 spatial "
+ "layers or more than 4 spatial x temporal layers");
+ }
+#endif
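For scale: REF_FRAMES is 8, so the product check rejects configurations past 8 total layers, and the alt-ref check requires alt_ref_sum <= 8 - ss_layers * ts_layers. With 2 spatial x 3 temporal layers that leaves 8 - 6 = 2 buffers, so at most two spatial layers may enable auto alt-ref; at 4 spatial x 2 temporal, no alt-ref buffers remain at all.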
+
// VP9 does not support a lower bound on the keyframe interval in
// automatic keyframe placement mode.
if (cfg->kf_mode != VPX_KF_DISABLED &&
@@ -219,8 +215,9 @@
RANGE_CHECK_HI(extra_cfg, sharpness, 7);
RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
- RANGE_CHECK(extra_cfg, arnr_type, 1, 3);
RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+ RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
RANGE_CHECK(extra_cfg, content,
VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1);
@@ -239,7 +236,7 @@
if (cfg->rc_twopass_stats_in.sz % packet_sz)
ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
- if (cfg->ss_number_layers > 1) {
+ if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) {
int i;
unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0};
@@ -279,12 +276,16 @@
}
}
+#if !CONFIG_VP9_HIGHBITDEPTH
+ if (cfg->g_profile > (unsigned int)PROFILE_1)
+ ERROR("Profile > 1 not supported in this build configuration");
+#endif
if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth > BITS_8)
- ERROR("High bit-depth not supported in profile < 2");
+ extra_cfg->bit_depth > VPX_BITS_8)
+ ERROR("Codec high bit-depth not supported in profile < 2");
if (cfg->g_profile > (unsigned int)PROFILE_1 &&
- extra_cfg->bit_depth == BITS_8)
- ERROR("Bit-depth 8 not supported in profile > 1");
+ extra_cfg->bit_depth == VPX_BITS_8)
+ ERROR("Codec bit-depth 8 not supported in profile > 1");
return VPX_CODEC_OK;
}
@@ -316,6 +317,9 @@
case VPX_IMG_FMT_I420: return 12;
case VPX_IMG_FMT_I422: return 16;
case VPX_IMG_FMT_I444: return 24;
+ case VPX_IMG_FMT_I42016: return 24;
+ case VPX_IMG_FMT_I42216: return 32;
+ case VPX_IMG_FMT_I44416: return 48;
default: assert(0 && "Invalid image format"); break;
}
return 0;
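The three new entries cover the 16-bit-per-sample variants, which simply double the 8-bit packing of the corresponding planar layouts. A small sanity-check sketch of the arithmetic:

#include <assert.h>
#include <stdio.h>

/* Average bits per pixel: 4:2:0 stores one luma sample plus a quarter of
 * each chroma plane per pixel (8 + 2 + 2 = 12 bits); the *16 variants
 * use 16-bit storage and so double each figure. */
int main(void) {
  const int bpp_i420 = 8 + 8 / 4 + 8 / 4;  /* 12 */
  const int bpp_i422 = 8 + 8 / 2 + 8 / 2;  /* 16 */
  const int bpp_i444 = 8 + 8 + 8;          /* 24 */
  assert(2 * bpp_i420 == 24);  /* VPX_IMG_FMT_I42016 */
  assert(2 * bpp_i422 == 32);  /* VPX_IMG_FMT_I42216 */
  assert(2 * bpp_i444 == 48);  /* VPX_IMG_FMT_I44416 */
  printf("ok\n");
  return 0;
}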
@@ -325,26 +329,27 @@
VP9EncoderConfig *oxcf,
const vpx_codec_enc_cfg_t *cfg,
const struct vp9_extracfg *extra_cfg) {
+ const int is_vbr = cfg->rc_end_usage == VPX_VBR;
oxcf->profile = cfg->g_profile;
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
oxcf->bit_depth = extra_cfg->bit_depth;
+ oxcf->input_bit_depth = cfg->g_input_bit_depth;
// guess a frame rate if out of whack, use 30
- oxcf->framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
- if (oxcf->framerate > 180)
- oxcf->framerate = 30;
+ oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+ if (oxcf->init_framerate > 180)
+ oxcf->init_framerate = 30;
+
+ oxcf->mode = GOOD;
switch (cfg->g_pass) {
case VPX_RC_ONE_PASS:
- oxcf->mode = ONE_PASS_GOOD;
oxcf->pass = 0;
break;
case VPX_RC_FIRST_PASS:
- oxcf->mode = TWO_PASS_FIRST;
oxcf->pass = 1;
break;
case VPX_RC_LAST_PASS:
- oxcf->mode = TWO_PASS_SECOND_BEST;
oxcf->pass = 2;
break;
}
@@ -371,9 +376,9 @@
oxcf->scaled_frame_width = cfg->rc_scaled_width;
oxcf->scaled_frame_height = cfg->rc_scaled_height;
- oxcf->maximum_buffer_size_ms = cfg->rc_buf_sz;
- oxcf->starting_buffer_level_ms = cfg->rc_buf_initial_sz;
- oxcf->optimal_buffer_level_ms = cfg->rc_buf_optimal_sz;
+ oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+ oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
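With this change a VBR encode ignores the caller's buffer model and runs with fixed defaults, all in milliseconds: a 240 s window and 60 s initial/optimal levels. A trivial sketch of the selection:

#include <stdio.h>

/* Sketch of the VBR override above: cfg_ms is whatever rc_buf_* value
 * the application configured; it only takes effect outside VBR. */
static long buffer_ms(int is_vbr, long cfg_ms, long vbr_default_ms) {
  return is_vbr ? vbr_default_ms : cfg_ms;
}

int main(void) {
  printf("%ld\n", buffer_ms(1, 1000, 240000));  /* VBR: 240000 */
  printf("%ld\n", buffer_ms(0, 1000, 240000));  /* CBR/CQ: 1000 */
  return 0;
}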
@@ -393,7 +398,6 @@
oxcf->sharpness = extra_cfg->sharpness;
oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
- oxcf->output_pkt_list = extra_cfg->pkt_list;
#if CONFIG_FP_MB_STATS
oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
@@ -401,7 +405,6 @@
oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
oxcf->arnr_strength = extra_cfg->arnr_strength;
- oxcf->arnr_type = extra_cfg->arnr_type;
oxcf->tuning = extra_cfg->tuning;
oxcf->content = extra_cfg->content;
@@ -428,6 +431,9 @@
}
} else if (oxcf->ss_number_layers == 1) {
oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
+#if CONFIG_SPATIAL_SVC
+ oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+#endif
}
oxcf->ts_number_layers = cfg->ts_number_layers;
@@ -597,9 +603,9 @@
static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx,
va_list args) {
- struct vp9_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args);
- return update_extra_cfg(ctx, &extra_cfg);
+ (void)ctx;
+ (void)args;
+ return VPX_CODEC_OK;
}
static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx,
@@ -659,52 +665,36 @@
(void)data;
if (ctx->priv == NULL) {
- int i;
- vpx_codec_enc_cfg_t *cfg;
- struct vpx_codec_alg_priv *priv = calloc(1, sizeof(*priv));
-
+ vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
if (priv == NULL)
return VPX_CODEC_MEM_ERROR;
- ctx->priv = &priv->base;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = priv;
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
- ctx->priv->alg_priv->cfg = *ctx->config.enc;
- ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
}
- cfg = &ctx->priv->alg_priv->cfg;
-
- // Select the extra vp6 configuration table based on the current
- // usage value. If the current usage value isn't found, use the
- // values for usage case 0.
- for (i = 0;
- extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
- ++i) {}
-
- priv->extra_cfg = extracfg_map[i].cfg;
- priv->extra_cfg.pkt_list = &priv->pkt_list.head;
-
+ priv->extra_cfg = default_extra_cfg;
vp9_initialize_enc();
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
if (res == VPX_CODEC_OK) {
- VP9_COMP *cpi;
- set_encoder_config(&ctx->priv->alg_priv->oxcf,
- &ctx->priv->alg_priv->cfg,
- &ctx->priv->alg_priv->extra_cfg);
- cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
- if (cpi == NULL)
+ set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+ priv->oxcf.use_highbitdepth =
+ (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
+ priv->cpi = vp9_create_compressor(&priv->oxcf);
+ if (priv->cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
- ctx->priv->alg_priv->cpi = cpi;
+ priv->cpi->output_pkt_list = &priv->pkt_list.head;
}
}
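The refactor drops the sz/iface/alg_priv bookkeeping and instead relies on vpx_codec_priv being the first member of the algorithm-private struct, so a single allocation serves both views and the casts in each direction are valid. A minimal sketch of the idiom, with stand-in struct names:

#include <stdio.h>
#include <stdlib.h>

struct base { int init_flags; };                  /* stand-in for vpx_codec_priv */
struct alg_priv { struct base base; int extra; }; /* base must come first */

int main(void) {
  struct alg_priv *priv = calloc(1, sizeof(*priv));
  struct base *b = (struct base *)priv;  /* same address as &priv->base */
  b->init_flags = 1;
  printf("%d\n", priv->base.init_flags);  /* 1 */
  free(priv);
  return 0;
}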
@@ -714,35 +704,40 @@
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
- free(ctx);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
-static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
unsigned long duration,
unsigned long deadline) {
- // Use best quality mode if no deadline is given.
- MODE new_qc = ONE_PASS_BEST;
+ MODE new_mode = BEST;
- if (deadline) {
- // Convert duration parameter from stream timebase to microseconds
- const uint64_t duration_us = (uint64_t)duration * 1000000 *
- (uint64_t)ctx->cfg.g_timebase.num /
- (uint64_t)ctx->cfg.g_timebase.den;
+ switch (ctx->cfg.g_pass) {
+ case VPX_RC_ONE_PASS:
+ if (deadline > 0) {
+ const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg;
- // If the deadline is more that the duration this frame is to be shown,
- // use good quality mode. Otherwise use realtime mode.
- new_qc = (deadline > duration_us) ? ONE_PASS_GOOD : REALTIME;
+ // Convert duration parameter from stream timebase to microseconds.
+ const uint64_t duration_us = (uint64_t)duration * 1000000 *
+ (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+
+ // If the deadline is more than the duration this frame is to be shown,
+ // use good quality mode. Otherwise use realtime mode.
+ new_mode = (deadline > duration_us) ? GOOD : REALTIME;
+ } else {
+ new_mode = BEST;
+ }
+ break;
+ case VPX_RC_FIRST_PASS:
+ break;
+ case VPX_RC_LAST_PASS:
+ new_mode = deadline > 0 ? GOOD : BEST;
+ break;
}
- if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
- new_qc = TWO_PASS_FIRST;
- else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
- new_qc = (new_qc == ONE_PASS_BEST) ? TWO_PASS_SECOND_BEST
- : TWO_PASS_SECOND_GOOD;
-
- if (ctx->oxcf.mode != new_qc) {
- ctx->oxcf.mode = new_qc;
+ if (ctx->oxcf.mode != new_mode) {
+ ctx->oxcf.mode = new_mode;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
}
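The one-pass branch keeps the old deadline heuristic but now maps onto the simplified GOOD/BEST/REALTIME mode set. A standalone sketch of the microsecond conversion and comparison, assuming a 1/30 timebase purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t num = 1, den = 30;   /* illustrative timebase */
  const uint64_t duration = 1;        /* frame shown for one timebase unit */
  const uint64_t duration_us = duration * 1000000 * num / den;  /* 33333 */
  const unsigned long deadline = 40000;  /* caller allows 40 ms */
  /* More time than the frame is displayed for: good quality, else realtime. */
  printf("%s\n", deadline > duration_us ? "GOOD" : "REALTIME");
  printf("deadline 0 selects BEST\n");
  return 0;
}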
@@ -821,6 +816,23 @@
return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
+static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
+ unsigned int lib_flags) {
+ vpx_codec_frame_flags_t flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY
+#if CONFIG_SPATIAL_SVC
+ || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame)
+#endif
+ )
+ flags |= VPX_FRAME_IS_KEY;
+
+ if (cpi->droppable)
+ flags |= VPX_FRAME_IS_DROPPABLE;
+
+ return flags;
+}
+
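The helper concentrates the flag packing that used to be inlined below: encoder-internal FRAMEFLAGS_* values are shifted into the upper 16 bits of the packet flags while the public VPX_FRAME_IS_* bits stay in the low word. A sketch with stand-in flag values (the real constants live in the library headers):

#include <stdio.h>

#define MY_FRAMEFLAGS_KEY 0x1        /* stand-in for the internal flag */
#define MY_VPX_FRAME_IS_KEY 0x1      /* stand-in for the public flag */
#define MY_VPX_FRAME_IS_DROPPABLE 0x4

static unsigned long pack_flags(unsigned int lib_flags, int droppable) {
  unsigned long flags = (unsigned long)lib_flags << 16;
  if (lib_flags & MY_FRAMEFLAGS_KEY) flags |= MY_VPX_FRAME_IS_KEY;
  if (droppable) flags |= MY_VPX_FRAME_IS_DROPPABLE;
  return flags;
}

int main(void) {
  printf("0x%lx\n", pack_flags(MY_FRAMEFLAGS_KEY, 1));  /* 0x10005 */
  return 0;
}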
static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
const vpx_image_t *img,
vpx_codec_pts_t pts,
@@ -828,18 +840,19 @@
vpx_enc_frame_flags_t flags,
unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
+ VP9_COMP *const cpi = ctx->cpi;
const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
if (img != NULL) {
res = validate_img(ctx, img);
// TODO(jzern) the checks related to cpi's validity should be treated as a
// failure condition, encoder setup is done fully in init() currently.
- if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
// There's no codec control for multiple alt-refs so check the encoder
// instance for its status to determine the compressed data size.
ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
get_image_bps(img) / 8 *
- (ctx->cpi->multi_arf_allowed ? 8 : 2);
+ (cpi->multi_arf_allowed ? 8 : 2);
if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
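The worst-case output buffer is sized from the raw frame size, with 8x headroom when multiple alt-refs are allowed and 2x otherwise, floored at 4096 bytes. A worked example for a 320x240 8-bit I420 stream:

#include <stdio.h>

int main(void) {
  const unsigned w = 320, h = 240, bps = 12;  /* I420: 12 bits per pixel */
  unsigned long sz = (unsigned long)w * h * bps / 8 * 2;  /* 230400 */
  if (sz < 4096) sz = 4096;  /* floor applied by the encoder */
  printf("%lu\n", sz);
  return 0;
}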
@@ -859,7 +872,7 @@
return VPX_CODEC_INVALID_PARAM;
}
- vp9_apply_encoding_flags(ctx->cpi, flags);
+ vp9_apply_encoding_flags(cpi, flags);
// Handle fixed keyframe intervals
if (ctx->cfg.kf_mode == VPX_KF_AUTO &&
@@ -871,7 +884,7 @@
}
// Initialize the encoder instance on the first frame.
- if (res == VPX_CODEC_OK && ctx->cpi != NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL) {
unsigned int lib_flags = 0;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -882,16 +895,15 @@
// Set up internal flags
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
- ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+ cpi->b_calculate_psnr = 1;
if (img != NULL) {
res = image2yuvconfig(img, &sd);
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
- if (vp9_receive_raw_frame(ctx->cpi, flags,
+ if (vp9_receive_raw_frame(cpi, flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
- VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
}
@@ -916,22 +928,21 @@
}
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
- -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size,
+ -1 != vp9_get_compressed_data(cpi, &lib_flags, &size,
cx_data, &dst_time_stamp,
&dst_end_time_stamp, !img)) {
if (size) {
- VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
vpx_codec_cx_pkt_t pkt;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi))
+ if (is_two_pass_svc(cpi))
cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size;
#endif
// Pack invisible frames with the next visible frame
- if (cpi->common.show_frame == 0
+ if (!cpi->common.show_frame
#if CONFIG_SPATIAL_SVC
- || (is_spatial_svc(cpi) &&
+ || (is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
#endif
) {
@@ -951,30 +962,7 @@
pkt.data.frame.duration =
(unsigned long)ticks_to_timebase_units(timebase,
dst_end_time_stamp - dst_time_stamp);
- pkt.data.frame.flags = lib_flags << 16;
-
- if (lib_flags & FRAMEFLAGS_KEY
-#if CONFIG_SPATIAL_SVC
- || (is_spatial_svc(cpi) &&
- cpi->svc.layer_context[0].is_key_frame)
-#endif
- )
- pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
-
- if (cpi->common.show_frame == 0) {
- pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
-
- // This timestamp should be as close as possible to the
- // prior PTS so that if a decoder uses pts to schedule when
- // to do this, we start right after last frame was decoded.
- // Invisible frames have no duration.
- pkt.data.frame.pts =
- ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1;
- pkt.data.frame.duration = 0;
- }
-
- if (cpi->droppable)
- pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
if (ctx->pending_cx_data) {
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
@@ -996,15 +984,21 @@
cx_data += size;
cx_data_sz -= size;
#if CONFIG_SPATIAL_SVC
- if (is_spatial_svc(cpi)) {
- vpx_codec_cx_pkt_t pkt = {0};
+ if (is_two_pass_svc(cpi)) {
+ vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
int i;
- pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+ vp9_zero(pkt_sizes);
+ vp9_zero(pkt_psnr);
+ pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+ pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
- pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
- cpi->svc.layer_context[i].layer_size = 0;
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[i];
+ pkt_sizes.data.layer_sizes[i] = lc->layer_size;
+ pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
+ lc->layer_size = 0;
}
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
}
#endif
}
@@ -1285,6 +1279,9 @@
320, // g_width
240, // g_height
+ VPX_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
{1, 30}, // g_timebase
0, // g_error_resilient
@@ -1345,16 +1342,19 @@
CODEC_INTERFACE(vpx_codec_vp9_cx) = {
"WebM Project VP9 Encoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+ VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR, // vpx_codec_caps_t
encoder_init, // vpx_codec_init_fn_t
encoder_destroy, // vpx_codec_destroy_fn_t
encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t
{ // NOLINT
- NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t
- NOT_IMPLEMENTED, // vpx_codec_decode_fn_t
- NOT_IMPLEMENTED, // vpx_codec_frame_get_fn_t
- NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t
+ NULL, // vpx_codec_peek_si_fn_t
+ NULL, // vpx_codec_get_si_fn_t
+ NULL, // vpx_codec_decode_fn_t
+ NULL, // vpx_codec_frame_get_fn_t
+ NULL // vpx_codec_set_fb_fn_t
},
{ // NOLINT
1, // 1 cfg map
@@ -1362,8 +1362,8 @@
encoder_encode, // vpx_codec_encode_fn_t
encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t
encoder_set_config, // vpx_codec_enc_config_set_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t
+ NULL, // vpx_codec_get_global_headers_fn_t
encoder_get_preview, // vpx_codec_get_preview_frame_fn_t
- NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t
+ NULL // vpx_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 4372ac9..393c66e 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -58,29 +58,22 @@
(void)data;
if (!ctx->priv) {
- vpx_codec_alg_priv_t *alg_priv = vpx_memalign(32, sizeof(*alg_priv));
- if (alg_priv == NULL)
+ vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv));
+ if (priv == NULL)
return VPX_CODEC_MEM_ERROR;
- vp9_zero(*alg_priv);
-
- ctx->priv = (vpx_codec_priv_t *)alg_priv;
- ctx->priv->sz = sizeof(*ctx->priv);
- ctx->priv->iface = ctx->iface;
- ctx->priv->alg_priv = alg_priv;
- ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+ ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
- ctx->priv->alg_priv->flushed = 0;
- ctx->priv->alg_priv->frame_parallel_decode =
- (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
- // Disable frame parallel decoding for now.
- ctx->priv->alg_priv->frame_parallel_decode = 0;
+ priv->si.sz = sizeof(priv->si);
+ priv->flushed = 0;
+ priv->frame_parallel_decode =
+ (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
+ priv->frame_parallel_decode = 0; // Disable for now
if (ctx->config.dec) {
- // Update the reference to the config structure to an internal copy.
- ctx->priv->alg_priv->cfg = *ctx->config.dec;
- ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
}
}
@@ -332,81 +325,6 @@
return VPX_CODEC_OK;
}
-static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
- void *decrypt_state,
- const uint8_t *data) {
- if (decrypt_cb) {
- uint8_t marker;
- decrypt_cb(decrypt_state, data, &marker, 1);
- return marker;
- }
- return *data;
-}
-
-static vpx_codec_err_t parse_superframe_index(const uint8_t *data,
- size_t data_sz,
- uint32_t sizes[8], int *count,
- vpx_decrypt_cb decrypt_cb,
- void *decrypt_state) {
- // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
- // it is a super frame index. If the last byte of real video compression
- // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
- // not the associated matching marker byte at the front of the index we have
- // an invalid bitstream and need to return an error.
-
- uint8_t marker;
-
- assert(data_sz);
- marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
- *count = 0;
-
- if ((marker & 0xe0) == 0xc0) {
- const uint32_t frames = (marker & 0x7) + 1;
- const uint32_t mag = ((marker >> 3) & 0x3) + 1;
- const size_t index_sz = 2 + mag * frames;
-
- // This chunk is marked as having a superframe index but doesn't have
- // enough data for it, thus it's an invalid superframe index.
- if (data_sz < index_sz)
- return VPX_CODEC_CORRUPT_FRAME;
-
- {
- const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state,
- data + data_sz - index_sz);
-
- // This chunk is marked as having a superframe index but doesn't have
- // the matching marker byte at the front of the index therefore it's an
- // invalid chunk.
- if (marker != marker2)
- return VPX_CODEC_CORRUPT_FRAME;
- }
-
- {
- // Found a valid superframe index.
- uint32_t i, j;
- const uint8_t *x = &data[data_sz - index_sz + 1];
-
- // Frames has a maximum of 8 and mag has a maximum of 4.
- uint8_t clear_buffer[32];
- assert(sizeof(clear_buffer) >= frames * mag);
- if (decrypt_cb) {
- decrypt_cb(decrypt_state, x, clear_buffer, frames * mag);
- x = clear_buffer;
- }
-
- for (i = 0; i < frames; ++i) {
- uint32_t this_sz = 0;
-
- for (j = 0; j < mag; ++j)
- this_sz |= (*x++) << (j * 8);
- sizes[i] = this_sz;
- }
- *count = frames;
- }
- }
- return VPX_CODEC_OK;
-}
-
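The parser is not deleted, it moves into the decoder proper as vp9_parse_superframe_index (see the call site below). For reference, the index format it decodes: a trailing marker byte 0b110mmnnn gives frames = nnn + 1 and mag = mm + 1 bytes per size field, so index_sz = 2 + mag * frames. A worked example:

#include <stdio.h>

int main(void) {
  const unsigned char marker = 0xc9;  /* 0b11001001 */
  if ((marker & 0xe0) == 0xc0) {
    const unsigned frames = (marker & 0x7) + 1;      /* 2 */
    const unsigned mag = ((marker >> 3) & 0x3) + 1;  /* 2 */
    printf("frames=%u mag=%u index_sz=%u\n", frames, mag,
           2 + mag * frames);                        /* index_sz=6 */
  }
  return 0;
}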
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
@@ -424,8 +342,8 @@
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
- res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
- ctx->decrypt_cb, ctx->decrypt_state);
+ res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
+ ctx->decrypt_cb, ctx->decrypt_state);
if (res != VPX_CODEC_OK)
return res;
@@ -519,6 +437,7 @@
// call to get_frame.
if (!(*iter)) {
img = &ctx->img;
+ img->bit_depth = (int)ctx->pbi->common.bit_depth;
*iter = img;
}
}
@@ -667,6 +586,23 @@
}
}
+static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const bit_depth = va_arg(args, unsigned int *);
+
+ if (bit_depth) {
+ if (ctx->pbi) {
+ const VP9_COMMON *const cm = &ctx->pbi->common;
+ *bit_depth = cm->bit_depth;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
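A hedged usage sketch for the new control: it assumes an initialized decoder that has already decoded a frame, since ctx->pbi is NULL before then and the control returns VPX_CODEC_ERROR.

#include <stdio.h>
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static void print_stream_bit_depth(vpx_codec_ctx_t *decoder) {
  unsigned int bit_depth = 0;
  if (vpx_codec_control(decoder, VP9D_GET_BIT_DEPTH, &bit_depth) ==
      VPX_CODEC_OK) {
    printf("stream bit depth: %u\n", bit_depth);  /* 8, 10 or 12 */
  }
}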
static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
va_list args) {
ctx->invert_tile_order = va_arg(args, int);
@@ -699,6 +635,7 @@
{VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted},
{VP9_GET_REFERENCE, ctrl_get_reference},
{VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size},
+ {VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth},
{ -1, NULL},
};
@@ -723,12 +660,12 @@
},
{ // NOLINT
0,
- NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t
- NOT_IMPLEMENTED, // vpx_codec_encode_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t
- NOT_IMPLEMENTED, // vpx_codec_enc_config_set_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_global_headers_fn_t
- NOT_IMPLEMENTED, // vpx_codec_get_preview_frame_fn_t
- NOT_IMPLEMENTED // vpx_codec_enc_mr_get_mem_loc_fn_t
+ NULL, // vpx_codec_enc_cfg_map_t
+ NULL, // vpx_codec_encode_fn_t
+ NULL, // vpx_codec_get_cx_data_fn_t
+ NULL, // vpx_codec_enc_config_set_fn_t
+ NULL, // vpx_codec_get_global_headers_fn_t
+ NULL, // vpx_codec_get_preview_frame_fn_t
+ NULL // vpx_codec_enc_mr_get_mem_loc_fn_t
}
};
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index dc46c4e..e450f7b 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -93,10 +93,6 @@
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
diff --git a/vpx/exports_enc b/vpx/exports_enc
index 07f0280..12e4578 100644
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -8,17 +8,12 @@
text vpx_codec_set_cx_data_buf
text vpx_svc_dump_statistics
text vpx_svc_encode
-text vpx_svc_get_buffer
text vpx_svc_get_encode_frame_count
-text vpx_svc_get_frame_size
text vpx_svc_get_message
text vpx_svc_init
-text vpx_svc_is_keyframe
text vpx_svc_release
text vpx_svc_set_keyframe
text vpx_svc_set_options
text vpx_svc_set_quantizers
text vpx_svc_set_scale_factors
text vpx_svc_get_layer_resolution
-text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 3ca6504..cbfffd0 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -286,8 +286,6 @@
vpx_codec_enc_cfg_t cfg;
} vpx_codec_enc_cfg_map_t;
-#define NOT_IMPLEMENTED 0
-
/*!\brief Decoder algorithm interface interface
*
* All decoders \ref MUST expose a variable of this type.
@@ -337,9 +335,6 @@
* and the pointer cast to the proper type.
*/
struct vpx_codec_priv {
- unsigned int sz;
- vpx_codec_iface_t *iface;
- struct vpx_codec_alg_priv *alg_priv;
const char *err_detail;
vpx_codec_flags_t init_flags;
struct {
@@ -347,8 +342,7 @@
vpx_codec_priv_cb_pair_t put_slice_cb;
} dec;
struct {
- int tbd;
- struct vpx_fixed_buf cx_data_dst_buf;
+ vpx_fixed_buf_t cx_data_dst_buf;
unsigned int cx_data_pad_before;
unsigned int cx_data_pad_after;
vpx_codec_cx_pkt_t cx_data_pkt;
@@ -369,11 +363,11 @@
#undef VPX_CTRL_USE_TYPE
#define VPX_CTRL_USE_TYPE(id, typ) \
- static typ id##__value(va_list args) {return va_arg(args, typ);} \
+ static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
#undef VPX_CTRL_USE_TYPE_DEPRECATED
#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \
- static typ id##__value(va_list args) {return va_arg(args, typ);} \
+ static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);}
#define CAST(id, arg) id##__value(arg)
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 7828615..4c1c04a 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -15,12 +15,12 @@
#include <assert.h>
#include <math.h>
+#include <limits.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define VPX_DISABLE_CTRL_TYPECHECKS 1
-#define VPX_CODEC_DISABLE_COMPAT 1
#include "./vpx_config.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
@@ -47,8 +47,33 @@
#define OPTION_BUFFER_SIZE 256
#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v
-static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
-static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
+static const int DEFAULT_QUANTIZER_VALUES[VPX_SS_MAX_LAYERS] = {
+ 60, 53, 39, 33, 27
+};
+
+static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = {
+ 4, 5, 7, 11, 16
+};
+
+static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = {
+ 16, 16, 16, 16, 16
+};
+
+typedef enum {
+ QUANTIZER = 0,
+ BITRATE,
+ SCALE_FACTOR,
+ AUTO_ALT_REF,
+ ALL_OPTION_TYPES
+} LAYER_OPTION_TYPE;
+
+static const int option_max_values[ALL_OPTION_TYPES] = {
+ 63, INT_MAX, INT_MAX, 1
+};
+
+static const int option_min_values[ALL_OPTION_TYPES] = {
+ 0, 0, 1, 0
+};
// One encoded frame
typedef struct FrameData {
@@ -68,6 +93,7 @@
int scaling_factor_den[VPX_SS_MAX_LAYERS];
int quantizer[VPX_SS_MAX_LAYERS];
int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+ int bitrates[VPX_SS_MAX_LAYERS];
// accumulated statistics
double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
@@ -81,71 +107,17 @@
// state variables
int encode_frame_count;
- int frame_received;
+ int psnr_pkt_received;
int frame_within_gop;
int layers;
int layer;
int is_keyframe;
-
- FrameData *frame_list;
- FrameData *frame_temp;
-
- char *rc_stats_buf;
- size_t rc_stats_buf_size;
- size_t rc_stats_buf_used;
+ int use_multiple_frame_contexts;
char message_buffer[2048];
vpx_codec_ctx_t *codec_ctx;
} SvcInternal;
-// create FrameData from encoder output
-static struct FrameData *fd_create(void *buf, size_t size,
- vpx_codec_frame_flags_t flags) {
- struct FrameData *const frame_data =
- (struct FrameData *)vpx_malloc(sizeof(*frame_data));
- if (frame_data == NULL) {
- return NULL;
- }
- frame_data->buf = vpx_malloc(size);
- if (frame_data->buf == NULL) {
- vpx_free(frame_data);
- return NULL;
- }
- vpx_memcpy(frame_data->buf, buf, size);
- frame_data->size = size;
- frame_data->flags = flags;
- return frame_data;
-}
-
-// free FrameData
-static void fd_free(struct FrameData *p) {
- if (p) {
- if (p->buf)
- vpx_free(p->buf);
- vpx_free(p);
- }
-}
-
-// add FrameData to list
-static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
- struct FrameData **p = list;
-
- while (*p != NULL) p = &(*p)->next;
- *p = layer_data;
- layer_data->next = NULL;
-}
-
-// free FrameData list
-static void fd_free_list(struct FrameData *list) {
- struct FrameData *p = list;
-
- while (p) {
- list = list->next;
- fd_free(p);
- p = list;
- }
-}
-
static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
if (svc_ctx == NULL) return NULL;
if (svc_ctx->internal == NULL) {
@@ -196,158 +168,62 @@
return retval;
}
-static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
- const char *quantizer_values) {
- char *input_string;
- char *token;
- const char *delim = ",";
- char *save_ptr;
- int found = 0;
- int i, q;
- vpx_codec_err_t res = VPX_CODEC_OK;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type,
+ char *input,
+ int *value0,
+ int *value1) {
+ if (type == SCALE_FACTOR) {
+ *value0 = strtol(input, &input, 10);
+ if (*input++ != '/')
+ return VPX_CODEC_INVALID_PARAM;
+ *value1 = strtol(input, &input, 10);
- if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
- input_string = strdup(DEFAULT_QUANTIZER_VALUES);
+ if (*value0 < option_min_values[SCALE_FACTOR] ||
+ *value1 < option_min_values[SCALE_FACTOR] ||
+ *value0 > option_max_values[SCALE_FACTOR] ||
+ *value1 > option_max_values[SCALE_FACTOR])
+ return VPX_CODEC_INVALID_PARAM;
} else {
- input_string = strdup(quantizer_values);
+ *value0 = atoi(input);
+ if (*value0 < option_min_values[type] ||
+ *value0 > option_max_values[type])
+ return VPX_CODEC_INVALID_PARAM;
}
-
- token = strtok_r(input_string, delim, &save_ptr);
- for (i = 0; i < svc_ctx->spatial_layers; ++i) {
- if (token != NULL) {
- q = atoi(token);
- if (q <= 0 || q > 100) {
- svc_log(svc_ctx, SVC_LOG_ERROR,
- "svc-quantizer-values: invalid value %s\n", token);
- res = VPX_CODEC_INVALID_PARAM;
- break;
- }
- token = strtok_r(NULL, delim, &save_ptr);
- found = i + 1;
- } else {
- q = 0;
- }
- si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
- }
- if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
- svc_log(svc_ctx, SVC_LOG_ERROR,
- "svc: quantizers: %d values required, but only %d specified\n",
- svc_ctx->spatial_layers, found);
- res = VPX_CODEC_INVALID_PARAM;
- }
- free(input_string);
- return res;
+ return VPX_CODEC_OK;
}
-static vpx_codec_err_t parse_auto_alt_ref(SvcContext *svc_ctx,
- const char *alt_ref_options) {
- char *input_string;
- char *token;
- const char *delim = ",";
- char *save_ptr;
- int found = 0, enabled = 0;
- int i, value;
- vpx_codec_err_t res = VPX_CODEC_OK;
- SvcInternal *const si = get_svc_internal(svc_ctx);
-
- if (alt_ref_options == NULL || strlen(alt_ref_options) == 0) {
- return VPX_CODEC_INVALID_PARAM;
- } else {
- input_string = strdup(alt_ref_options);
- }
-
- token = strtok_r(input_string, delim, &save_ptr);
- for (i = 0; i < svc_ctx->spatial_layers; ++i) {
- if (token != NULL) {
- value = atoi(token);
- if (value < 0 || value > 1) {
- svc_log(svc_ctx, SVC_LOG_ERROR,
- "enable auto alt ref values: invalid value %s\n", token);
- res = VPX_CODEC_INVALID_PARAM;
- break;
- }
- token = strtok_r(NULL, delim, &save_ptr);
- found = i + 1;
- } else {
- value = 0;
- }
- si->enable_auto_alt_ref[i] = value;
- if (value > 0)
- ++enabled;
- }
- if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
- svc_log(svc_ctx, SVC_LOG_ERROR,
- "svc: quantizers: %d values required, but only %d specified\n",
- svc_ctx->spatial_layers, found);
- res = VPX_CODEC_INVALID_PARAM;
- }
- if (enabled > REF_FRAMES - svc_ctx->spatial_layers) {
- svc_log(svc_ctx, SVC_LOG_ERROR,
- "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
- "enabled auto alt reference frame, but % layers are enabled\n",
- REF_FRAMES - svc_ctx->spatial_layers, enabled);
- res = VPX_CODEC_INVALID_PARAM;
- }
- free(input_string);
- return res;
-}
-
-static void log_invalid_scale_factor(SvcContext *svc_ctx, const char *value) {
- svc_log(svc_ctx, SVC_LOG_ERROR, "svc scale-factors: invalid value %s\n",
- value);
-}
-
-static vpx_codec_err_t parse_scale_factors(SvcContext *svc_ctx,
- const char *scale_factors) {
- char *input_string;
- char *token;
- const char *delim = ",";
- char *save_ptr;
- int found = 0;
+static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx,
+ LAYER_OPTION_TYPE type,
+ const char *input,
+ int *option0,
+ int *option1) {
int i;
- int64_t num, den;
vpx_codec_err_t res = VPX_CODEC_OK;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ char *input_string;
+ char *token;
+ const char *delim = ",";
+ char *save_ptr;
- if (scale_factors == NULL || strlen(scale_factors) == 0) {
- input_string = strdup(DEFAULT_SCALE_FACTORS);
- } else {
- input_string = strdup(scale_factors);
- }
+ if (input == NULL || option0 == NULL ||
+ (option1 == NULL && type == SCALE_FACTOR))
+ return VPX_CODEC_INVALID_PARAM;
+
+ input_string = strdup(input);
token = strtok_r(input_string, delim, &save_ptr);
for (i = 0; i < svc_ctx->spatial_layers; ++i) {
- num = den = 0;
if (token != NULL) {
- num = strtol(token, &token, 10);
- if (num <= 0) {
- log_invalid_scale_factor(svc_ctx, token);
- res = VPX_CODEC_INVALID_PARAM;
+ res = extract_option(type, token, option0 + i, option1 + i);
+ if (res != VPX_CODEC_OK)
break;
- }
- if (*token++ != '/') {
- log_invalid_scale_factor(svc_ctx, token);
- res = VPX_CODEC_INVALID_PARAM;
- break;
- }
- den = strtol(token, &token, 10);
- if (den <= 0) {
- log_invalid_scale_factor(svc_ctx, token);
- res = VPX_CODEC_INVALID_PARAM;
- break;
- }
token = strtok_r(NULL, delim, &save_ptr);
- found = i + 1;
+ } else {
+ break;
}
- si->scaling_factor_num[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
- (int)num;
- si->scaling_factor_den[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
- (int)den;
}
- if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
+ if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) {
svc_log(svc_ctx, SVC_LOG_ERROR,
- "svc: scale-factors: %d values required, but only %d specified\n",
- svc_ctx->spatial_layers, found);
+ "svc: layer params type: %d %d values required, "
+ "but only %d specified\n", type, svc_ctx->spatial_layers, i);
res = VPX_CODEC_INVALID_PARAM;
}
free(input_string);
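extract_option() replaces three near-identical parsers with one table-driven routine; the only structured case is SCALE_FACTOR, which splits "num/den" with strtol. A standalone sketch of that branch, with the range limits hard-coded to the table values above:

#include <stdio.h>
#include <stdlib.h>

static int parse_scale(const char *s, int *num, int *den) {
  char *end;
  *num = (int)strtol(s, &end, 10);
  if (*end++ != '/') return 0;   /* separator must be '/' */
  *den = (int)strtol(end, &end, 10);
  return *num >= 1 && *den >= 1; /* SCALE_FACTOR minimum is 1 */
}

int main(void) {
  int num, den;
  printf("%d\n", parse_scale("4/16", &num, &den));  /* 1: num=4, den=16 */
  printf("%d\n", parse_scale("4:16", &num, &den));  /* 0: bad separator */
  return 0;
}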
@@ -366,7 +242,9 @@
char *option_name;
char *option_value;
char *input_ptr;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
vpx_codec_err_t res = VPX_CODEC_OK;
+ int i, alt_ref_enabled = 0;
if (options == NULL) return VPX_CODEC_OK;
input_string = strdup(options);
@@ -382,17 +260,29 @@
res = VPX_CODEC_INVALID_PARAM;
break;
}
- if (strcmp("layers", option_name) == 0) {
+ if (strcmp("spatial-layers", option_name) == 0) {
svc_ctx->spatial_layers = atoi(option_value);
+ } else if (strcmp("temporal-layers", option_name) == 0) {
+ svc_ctx->temporal_layers = atoi(option_value);
} else if (strcmp("scale-factors", option_name) == 0) {
- res = parse_scale_factors(svc_ctx, option_value);
+ res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR, option_value,
+ si->scaling_factor_num,
+ si->scaling_factor_den);
if (res != VPX_CODEC_OK) break;
} else if (strcmp("quantizers", option_name) == 0) {
- res = parse_quantizer_values(svc_ctx, option_value);
+ res = parse_layer_options_from_string(svc_ctx, QUANTIZER, option_value,
+ si->quantizer, NULL);
if (res != VPX_CODEC_OK) break;
} else if (strcmp("auto-alt-refs", option_name) == 0) {
- res = parse_auto_alt_ref(svc_ctx, option_value);
+ res = parse_layer_options_from_string(svc_ctx, AUTO_ALT_REF, option_value,
+ si->enable_auto_alt_ref, NULL);
if (res != VPX_CODEC_OK) break;
+ } else if (strcmp("bitrates", option_name) == 0) {
+ res = parse_layer_options_from_string(svc_ctx, BITRATE, option_value,
+ si->bitrates, NULL);
+ if (res != VPX_CODEC_OK) break;
+ } else if (strcmp("multi-frame-contexts", option_name) == 0) {
+ si->use_multiple_frame_contexts = atoi(option_value);
} else {
svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
res = VPX_CODEC_INVALID_PARAM;
@@ -401,6 +291,22 @@
option_name = strtok_r(NULL, "=", &input_ptr);
}
free(input_string);
+
+ if (si->use_multiple_frame_contexts &&
+ (svc_ctx->spatial_layers > 3 ||
+ svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4))
+ res = VPX_CODEC_INVALID_PARAM;
+
+ for (i = 0; i < svc_ctx->spatial_layers; ++i)
+ alt_ref_enabled += si->enable_auto_alt_ref[i];
+ if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
+ "enabled auto alt reference frame, but % layers are enabled\n",
+ REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
+ res = VPX_CODEC_INVALID_PARAM;
+ }
+
return res;
}
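The recognized keys change from the old "layers" to "spatial-layers", and "temporal-layers", "bitrates" and "multi-frame-contexts" are new. An illustrative options string in the new syntax, with all values made up; 2 x 2 = 4 layers keeps multi-frame-contexts within the limit checked above:

#include <stdio.h>

int main(void) {
  const char *options =
      "spatial-layers=2 temporal-layers=2 "
      "scale-factors=8/16,16/16 "
      "bitrates=400,1200 "
      "multi-frame-contexts=1";
  printf("%s\n", options);  /* would be handed to the SVC context */
  return 0;
}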
@@ -436,6 +342,39 @@
return VPX_CODEC_OK;
}
+void assign_layer_bitrates(const SvcInternal *const si,
+ vpx_codec_enc_cfg_t *const enc_cfg) {
+ int i;
+
+ if (si->bitrates[0] != 0) {
+ enc_cfg->rc_target_bitrate = 0;
+ for (i = 0; i < si->layers; ++i) {
+ enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
+ enc_cfg->rc_target_bitrate += si->bitrates[i];
+ }
+ } else {
+ float total = 0;
+ float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
+
+ for (i = 0; i < si->layers; ++i) {
+ if (si->scaling_factor_den[i] > 0) {
+ alloc_ratio[i] = (float)(si->scaling_factor_num[i] * 1.0 /
+ si->scaling_factor_den[i]);
+
+ alloc_ratio[i] *= alloc_ratio[i];
+ total += alloc_ratio[i];
+ }
+ }
+
+ for (i = 0; i < si->layers; ++i) {
+ if (total > 0) {
+ enc_cfg->ss_target_bitrate[i] = (unsigned int)
+ (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
+ }
+ }
+ }
+}
+
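When no explicit bitrates are given, the factored-out helper keeps the old resolution-based split: each layer's scale factor is squared and the target bitrate divided proportionally. A worked example for factors 4/16, 8/16, 16/16 (ratios 0.0625, 0.25, 1.0; total 1.3125):

#include <stdio.h>

int main(void) {
  const int num[3] = {4, 8, 16}, den[3] = {16, 16, 16};
  const unsigned target = 1000;  /* kbit/s, illustrative */
  float ratio[3], total = 0;
  int i;
  for (i = 0; i < 3; ++i) {
    ratio[i] = (float)num[i] / den[i];
    ratio[i] *= ratio[i];   /* square, as above */
    total += ratio[i];
  }
  for (i = 0; i < 3; ++i)
    printf("layer %d: %u\n", i, (unsigned)(target * ratio[i] / total));
  /* prints roughly 47, 190, 761 */
  return 0;
}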
vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *enc_cfg) {
@@ -469,55 +408,64 @@
return VPX_CODEC_INVALID_PARAM;
}
- res = parse_quantizer_values(svc_ctx, si->quantizers);
- if (res != VPX_CODEC_OK) return res;
+ for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
+ si->quantizer[i] = DEFAULT_QUANTIZER_VALUES[i];
+ si->scaling_factor_num[i] = DEFAULT_SCALE_FACTORS_NUM[i];
+ si->scaling_factor_den[i] = DEFAULT_SCALE_FACTORS_DEN[i];
+ }
- res = parse_scale_factors(svc_ctx, si->scale_factors);
- if (res != VPX_CODEC_OK) return res;
+ if (strlen(si->quantizers) > 0) {
+ res = parse_layer_options_from_string(svc_ctx, QUANTIZER, si->quantizers,
+ si->quantizer, NULL);
+ if (res != VPX_CODEC_OK)
+ return res;
+ }
+
+ if (strlen(si->scale_factors) > 0) {
+ res = parse_layer_options_from_string(svc_ctx, SCALE_FACTOR,
+ si->scale_factors,
+ si->scaling_factor_num,
+ si->scaling_factor_den);
+ if (res != VPX_CODEC_OK)
+ return res;
+ }
// Parse aggregate command line options. Options must start with
// "layers=xx" then followed by other options
res = parse_options(svc_ctx, si->options);
if (res != VPX_CODEC_OK) return res;
+ if (svc_ctx->spatial_layers < 1)
+ svc_ctx->spatial_layers = 1;
+ if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS)
+ svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS;
+
+ if (svc_ctx->temporal_layers < 1)
+ svc_ctx->temporal_layers = 1;
+ if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
+ svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
+
si->layers = svc_ctx->spatial_layers;
- // Assign target bitrate for each layer. We calculate the ratio
- // from the resolution for now.
- // TODO(Minghai): Optimize the mechanism of allocating bits after
- // implementing svc two pass rate control.
- if (si->layers > 1) {
- float total = 0;
- float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
-
- assert(si->layers <= VPX_SS_MAX_LAYERS);
- for (i = 0; i < si->layers; ++i) {
- int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers;
- if (pos < VPX_SS_MAX_LAYERS && si->scaling_factor_den[pos] > 0) {
- alloc_ratio[i] = (float)(si->scaling_factor_num[pos] * 1.0 /
- si->scaling_factor_den[pos]);
-
- alloc_ratio[i] *= alloc_ratio[i];
- total += alloc_ratio[i];
- }
- }
-
- for (i = 0; i < si->layers; ++i) {
- if (total > 0) {
- enc_cfg->ss_target_bitrate[i] = (unsigned int)
- (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
- }
- }
- }
+ assign_layer_bitrates(si, enc_cfg);
#if CONFIG_SPATIAL_SVC
for (i = 0; i < si->layers; ++i)
enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
#endif
+ if (svc_ctx->temporal_layers > 1) {
+ int i;
+ for (i = 0; i < svc_ctx->temporal_layers; ++i) {
+ enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate /
+ svc_ctx->temporal_layers;
+ enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i);
+ }
+ }
+
// modify encoder configuration
enc_cfg->ss_number_layers = si->layers;
- enc_cfg->ts_number_layers = 1; // Temporal layers not used in this encoder.
+ enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
// TODO(ivanmaltz): determine if these values need to be set explicitly for
// svc, or if the normal default/override mechanism can be used
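The temporal defaults added above divide the target bitrate evenly and halve the frame rate per layer step. A worked example for three temporal layers:

#include <stdio.h>

int main(void) {
  const int temporal_layers = 3;
  const unsigned target = 900;  /* kbit/s, illustrative */
  int i;
  /* decimators come out as 4, 2, 1: each layer doubles the frame rate */
  for (i = 0; i < temporal_layers; ++i)
    printf("layer %d: decimator %d, bitrate %u\n",
           i, 1 << (temporal_layers - 1 - i), target / temporal_layers);
  return 0;
}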
@@ -534,7 +482,8 @@
enc_cfg->rc_buf_initial_sz = 500;
enc_cfg->rc_buf_optimal_sz = 600;
enc_cfg->rc_buf_sz = 1000;
- enc_cfg->g_error_resilient = 1;
+ if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
+ enc_cfg->g_error_resilient = 1;
// Initialize codec
res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
@@ -553,7 +502,7 @@
int layer,
unsigned int *width,
unsigned int *height) {
- int w, h, index, num, den;
+ int w, h, num, den;
const SvcInternal *const si = get_const_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL) {
@@ -561,9 +510,8 @@
}
if (layer < 0 || layer >= si->layers) return VPX_CODEC_INVALID_PARAM;
- index = layer + VPX_SS_MAX_LAYERS - si->layers;
- num = si->scaling_factor_num[index];
- den = si->scaling_factor_den[index];
+ num = si->scaling_factor_num[layer];
+ den = si->scaling_factor_den[layer];
if (num == 0 || den == 0) return VPX_CODEC_INVALID_PARAM;
w = si->width * num / den;
@@ -581,7 +529,7 @@
static void set_svc_parameters(SvcContext *svc_ctx,
vpx_codec_ctx_t *codec_ctx) {
- int layer, layer_index;
+ int layer;
vpx_svc_parameters_t svc_params;
SvcInternal *const si = get_svc_internal(svc_ctx);
@@ -595,11 +543,10 @@
&svc_params.height)) {
svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
}
- layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) {
- svc_params.min_quantizer = si->quantizer[layer_index];
- svc_params.max_quantizer = si->quantizer[layer_index];
+ svc_params.min_quantizer = si->quantizer[layer];
+ svc_params.max_quantizer = si->quantizer[layer];
} else {
svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer;
svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
@@ -619,14 +566,12 @@
vpx_codec_err_t res;
vpx_codec_iter_t iter;
const vpx_codec_cx_pkt_t *cx_pkt;
- int layer_for_psnr = 0;
SvcInternal *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
svc_log_reset(svc_ctx);
- si->rc_stats_buf_used = 0;
si->layers = svc_ctx->spatial_layers;
if (si->encode_frame_count == 0) {
@@ -657,61 +602,37 @@
iter = NULL;
while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
switch (cx_pkt->kind) {
- case VPX_CODEC_CX_FRAME_PKT: {
- fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
- cx_pkt->data.frame.sz,
- cx_pkt->data.frame.flags));
-
- svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
- "pts: %d\n", si->frame_received,
- (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
- (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-
- ++si->frame_received;
- layer_for_psnr = 0;
- break;
- }
- case VPX_CODEC_PSNR_PKT: {
- int i;
- svc_log(svc_ctx, SVC_LOG_DEBUG,
- "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
- "%2.3f %2.3f %2.3f %2.3f \n",
- si->frame_received, layer_for_psnr,
- cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
- cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
- svc_log(svc_ctx, SVC_LOG_DEBUG,
- "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
- "%2.3f %2.3f %2.3f %2.3f \n",
- si->frame_received, layer_for_psnr,
- cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
- cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
- for (i = 0; i < COMPONENTS; i++) {
- si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
- si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
- }
- ++layer_for_psnr;
- break;
- }
- case VPX_CODEC_STATS_PKT: {
- size_t new_size = si->rc_stats_buf_used +
- cx_pkt->data.twopass_stats.sz;
-
- if (new_size > si->rc_stats_buf_size) {
- char *p = (char*)realloc(si->rc_stats_buf, new_size);
- if (p == NULL) {
- svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
- return VPX_CODEC_MEM_ERROR;
- }
- si->rc_stats_buf = p;
- si->rc_stats_buf_size = new_size;
- }
-
- memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
- cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
- si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
- break;
- }
#if CONFIG_SPATIAL_SVC
+ case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
+ int i;
+ for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+ int j;
+ svc_log(svc_ctx, SVC_LOG_DEBUG,
+ "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+ "%2.3f %2.3f %2.3f %2.3f \n",
+ si->psnr_pkt_received, i,
+ cx_pkt->data.layer_psnr[i].psnr[0],
+ cx_pkt->data.layer_psnr[i].psnr[1],
+ cx_pkt->data.layer_psnr[i].psnr[2],
+ cx_pkt->data.layer_psnr[i].psnr[3]);
+ svc_log(svc_ctx, SVC_LOG_DEBUG,
+ "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+ "%2.3f %2.3f %2.3f %2.3f \n",
+ si->psnr_pkt_received, i,
+ cx_pkt->data.layer_psnr[i].sse[0],
+ cx_pkt->data.layer_psnr[i].sse[1],
+ cx_pkt->data.layer_psnr[i].sse[2],
+ cx_pkt->data.layer_psnr[i].sse[3]);
+
+ for (j = 0; j < COMPONENTS; ++j) {
+ si->psnr_sum[i][j] +=
+ cx_pkt->data.layer_psnr[i].psnr[j];
+ si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
+ }
+ }
+ ++si->psnr_pkt_received;
+ break;
+ }
case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
int i;
for (i = 0; i < si->layers; ++i)
@@ -739,41 +660,12 @@
return si->message_buffer;
}
-// We will maintain a list of output frame buffers since with lag_in_frame
-// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
-// remove a frame buffer from the list the put it to a temporal pointer, which
-// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
-void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
- SvcInternal *const si = get_svc_internal(svc_ctx);
- if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
-
- if (si->frame_temp)
- fd_free(si->frame_temp);
-
- si->frame_temp = si->frame_list;
- si->frame_list = si->frame_list->next;
-
- return si->frame_temp->buf;
-}
-
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
- if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
- return si->frame_list->size;
-}
-
int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
const SvcInternal *const si = get_const_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return 0;
return si->encode_frame_count;
}
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
- if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
- return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
-}
-
void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
SvcInternal *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return;
@@ -787,7 +679,7 @@
// dump accumulated statistics and reset accumulated values
const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
- int number_of_frames, encode_frame_count;
+ int number_of_frames;
int i, j;
uint32_t bytes_total = 0;
double scale[COMPONENTS];
@@ -800,12 +692,11 @@
svc_log_reset(svc_ctx);
- encode_frame_count = si->encode_frame_count;
- if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+ number_of_frames = si->psnr_pkt_received;
+ if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx);
svc_log(svc_ctx, SVC_LOG_INFO, "\n");
for (i = 0; i < si->layers; ++i) {
- number_of_frames = encode_frame_count;
svc_log(svc_ctx, SVC_LOG_INFO,
"Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
@@ -853,26 +744,8 @@
// SvcInternal if it was not already allocated
si = (SvcInternal *)svc_ctx->internal;
if (si != NULL) {
- fd_free(si->frame_temp);
- fd_free_list(si->frame_list);
- if (si->rc_stats_buf) {
- free(si->rc_stats_buf);
- }
free(si);
svc_ctx->internal = NULL;
}
}
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
- if (svc_ctx == NULL || si == NULL) return 0;
- return si->rc_stats_buf_used;
-}
-
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
- if (svc_ctx == NULL || si == NULL) return NULL;
- return si->rc_stats_buf;
-}
-
-
diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c
index d175eae..5a495ce 100644
--- a/vpx/src/vpx_codec.c
+++ b/vpx/src/vpx_codec.c
@@ -88,8 +88,7 @@
else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
else {
- if (ctx->priv->alg_priv)
- ctx->iface->destroy(ctx->priv->alg_priv);
+ ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv);
ctx->iface = NULL;
ctx->name = NULL;
@@ -125,7 +124,7 @@
va_list ap;
va_start(ap, ctrl_id);
- res = entry->fn(ctx->priv->alg_priv, ap);
+ res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap);
va_end(ap);
break;
}
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 4d22a08..802d8ed 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -18,9 +18,13 @@
#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+ return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_dec_cfg_t *cfg,
+ const vpx_codec_dec_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver) {
vpx_codec_err_t res;
@@ -54,9 +58,6 @@
ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
vpx_codec_destroy(ctx);
}
-
- if (ctx->priv)
- ctx->priv->iface = ctx->iface;
}
return SAVE_STATUS(ctx, res);
@@ -97,7 +98,7 @@
si->w = 0;
si->h = 0;
- res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si);
+ res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
}
return SAVE_STATUS(ctx, res);
@@ -118,8 +119,8 @@
else if (!ctx->iface || !ctx->priv)
res = VPX_CODEC_ERROR;
else {
- res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz,
- user_priv, deadline);
+ res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv,
+ deadline);
}
return SAVE_STATUS(ctx, res);
@@ -132,7 +133,7 @@
if (!ctx || !iter || !ctx->iface || !ctx->priv)
img = NULL;
else
- img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter);
+ img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
return img;
}
@@ -188,7 +189,7 @@
!(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
res = VPX_CODEC_ERROR;
} else {
- res = ctx->iface->dec.set_fb_fn(ctx->priv->alg_priv, cb_get, cb_release,
+ res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
cb_priv);
}
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 6e18bd1..cd10c41 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -15,14 +15,18 @@
*/
#include <limits.h>
#include <string.h>
-#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
#define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
+static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) {
+ return (vpx_codec_alg_priv_t *)ctx->priv;
+}
+
vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_enc_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver) {
vpx_codec_err_t res;
@@ -53,9 +57,6 @@
ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
vpx_codec_destroy(ctx);
}
-
- if (ctx->priv)
- ctx->priv->iface = ctx->iface;
}
return SAVE_STATUS(ctx, res);
@@ -135,9 +136,6 @@
}
}
- if (ctx->priv)
- ctx->priv->iface = ctx->iface;
-
if (res)
break;
@@ -222,7 +220,7 @@
FLOATING_POINT_INIT();
if (num_enc == 1)
- res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
duration, flags, deadline);
else {
/* Multi-resolution encoding:
@@ -236,7 +234,7 @@
if (img) img += num_enc - 1;
for (i = num_enc - 1; i >= 0; i--) {
- if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+ if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts,
duration, flags, deadline)))
break;
@@ -265,7 +263,7 @@
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
ctx->err = VPX_CODEC_INCAPABLE;
else
- pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter);
+ pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
}
if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
@@ -333,7 +331,7 @@
else if (!ctx->iface->enc.get_preview)
ctx->err = VPX_CODEC_INCAPABLE;
else
- img = ctx->iface->enc.get_preview(ctx->priv->alg_priv);
+ img = ctx->iface->enc.get_preview(get_alg_priv(ctx));
}
return img;
@@ -351,7 +349,7 @@
else if (!ctx->iface->enc.get_glob_hdrs)
ctx->err = VPX_CODEC_INCAPABLE;
else
- buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv);
+ buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
}
return buf;
@@ -367,7 +365,7 @@
else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER))
res = VPX_CODEC_INCAPABLE;
else
- res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg);
+ res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
return SAVE_STATUS(ctx, res);
}
diff --git a/vpx/src/vpx_image.c b/vpx/src/vpx_image.c
index 8c7e3cf..e58b61e 100644
--- a/vpx/src/vpx_image.c
+++ b/vpx/src/vpx_image.c
@@ -8,38 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include <stdlib.h>
#include <string.h>
+
#include "vpx/vpx_image.h"
#include "vpx/vpx_integer.h"
-
-#define ADDRESS_STORAGE_SIZE sizeof(size_t)
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
-
-/* Memalign code is copied from vpx_mem.c */
-static void *img_buf_memalign(size_t align, size_t size) {
- void *addr,
- * x = NULL;
-
- addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
-
- if (addr) {
- x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
- /* save the actual malloc address */
- ((size_t *)x)[-1] = (size_t)addr;
- }
-
- return x;
-}
-
-static void img_buf_free(void *memblk) {
- if (memblk) {
- void *addr = (void *)(((size_t *)memblk)[-1]);
- free(addr);
- }
-}
+#include "vpx_mem/vpx_mem.h"
static vpx_image_t *img_alloc_helper(vpx_image_t *img,
vpx_img_fmt_t fmt,
@@ -172,7 +146,7 @@
if (alloc_size != (size_t)alloc_size)
goto fail;
- img->img_data = img_buf_memalign(buf_align, (size_t)alloc_size);
+ img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size);
img->img_data_owner = 1;
}
@@ -180,7 +154,7 @@
goto fail;
img->fmt = fmt;
- img->bit_depth = (fmt & VPX_IMG_FMT_HIGH) ? 16 : 8;
+ img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
img->w = w;
img->h = h;
img->x_chroma_shift = xcs;
@@ -296,7 +270,7 @@
void vpx_img_free(vpx_image_t *img) {
if (img) {
if (img->img_data && img->img_data_owner)
- img_buf_free(img->img_data);
+ vpx_free(img->img_data);
if (img->self_allocd)
free(img);
diff --git a/vpx/svc_context.h b/vpx/svc_context.h
index e0de263..0ed2f0f 100644
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -31,7 +31,8 @@
typedef struct {
// public interface to svc_command options
- int spatial_layers; // number of layers
+ int spatial_layers; // number of spatial layers
+ int temporal_layers; // number of temporal layers
SVC_LOG_LEVEL log_level; // amount of information to display
int log_print; // when set, printf log messages instead of returning the
// message with svc_get_message
@@ -95,29 +96,6 @@
const char *vpx_svc_get_message(const SvcContext *svc_ctx);
/**
- * return size of encoded data to be returned by vpx_svc_get_buffer.
- * it needs to be called before vpx_svc_get_buffer.
- */
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer with encoded data. encoder will maintain a list of frame
- * buffers. each call of vpx_svc_get_buffer() will return one frame.
- */
-void *vpx_svc_get_buffer(SvcContext *svc_ctx);
-
-/**
- * return size of two pass rate control stats data to be returned by
- * vpx_svc_get_rc_stats_buffer
- */
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer two pass of rate control stats data
- */
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx);
-
-/**
* return spatial resolution of the specified layer
*/
vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
@@ -130,11 +108,6 @@
int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx);
/**
- * return 1 if last encoded frame was a keyframe
- */
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx);
-
-/**
* force the next frame to be a keyframe
*/
void vpx_svc_set_keyframe(SvcContext *svc_ctx);
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index bd7f19c..379b306 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -75,6 +75,9 @@
/** control function to get the display dimensions for the current frame. */
VP9D_GET_DISPLAY_SIZE,
+ /** control function to get the bit depth of the stream. */
+ VP9D_GET_BIT_DEPTH,
+
/** For testing. */
VP9_INVERT_TILE_DECODE_ORDER,
@@ -118,6 +121,7 @@
VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
/*! @} - end defgroup vp8_decoder */
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 07df72a..b25308e 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -203,9 +203,11 @@
const char *err_detail; /**< Detailed info, if available */
vpx_codec_flags_t init_flags; /**< Flags passed at init time */
union {
- struct vpx_codec_dec_cfg *dec; /**< Decoder Configuration Pointer */
- struct vpx_codec_enc_cfg *enc; /**< Encoder Configuration Pointer */
- void *raw;
+ /**< Decoder Configuration Pointer */
+ const struct vpx_codec_dec_cfg *dec;
+ /**< Encoder Configuration Pointer */
+ const struct vpx_codec_enc_cfg *enc;
+ const void *raw;
} config; /**< Configuration pointer aliasing union */
vpx_codec_priv_t *priv; /**< Algorithm private storage */
} vpx_codec_ctx_t;
@@ -215,9 +217,9 @@
* This enumeration determines the bit depth of the codec.
*/
typedef enum vpx_bit_depth {
- VPX_BITS_8, /**< 8 bits */
- VPX_BITS_10, /**< 10 bits */
- VPX_BITS_12 /**< 12 bits */
+ VPX_BITS_8 = 8, /**< 8 bits */
+ VPX_BITS_10 = 10, /**< 10 bits */
+ VPX_BITS_12 = 12, /**< 12 bits */
} vpx_bit_depth_t;
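Giving the enumerators their literal bit counts lets callers use the value arithmetically, as the decoder now does with img->bit_depth above. A minimal sketch with a stand-in enum:

#include <stdio.h>

typedef enum { BITS_8 = 8, BITS_10 = 10, BITS_12 = 12 } bit_depth_t;

int main(void) {
  bit_depth_t d = BITS_10;
  printf("bytes per sample: %d\n", d > 8 ? 2 : 1);  /* 2 */
  return 0;
}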
/*
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 10b89fa..62fd919 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -135,7 +135,7 @@
*/
vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_dec_cfg_t *cfg,
+ const vpx_codec_dec_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver);
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 7dbbf2f..996fcb4 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -80,6 +80,9 @@
*/
#define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000
+/*! Can support input images with a bit depth greater than 8.
+ */
+#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000
/*! \brief Initialization-time Feature Enabling
*
@@ -91,6 +94,7 @@
#define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
#define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one
partition at a time. */
+#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
@@ -159,6 +163,7 @@
VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
#if CONFIG_SPATIAL_SVC
VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
+ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
#endif
VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */
};
@@ -188,16 +193,17 @@
has id 0.*/
} frame; /**< data for compressed frame packet */
- struct vpx_fixed_buf twopass_stats; /**< data for two-pass packet */
- struct vpx_fixed_buf firstpass_mb_stats; /**< first pass mb packet */
+ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */
+ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
struct vpx_psnr_pkt {
unsigned int samples[4]; /**< Number of samples, total/y/u/v */
uint64_t sse[4]; /**< sum squared error, total/y/u/v */
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
- struct vpx_fixed_buf raw; /**< data for arbitrary packets */
+ vpx_fixed_buf_t raw; /**< data for arbitrary packets */
#if CONFIG_SPATIAL_SVC
size_t layer_sizes[VPX_SS_MAX_LAYERS];
+ struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
#endif
/* This packet size is fixed to allow codecs to extend this
@@ -324,6 +330,21 @@
*/
unsigned int g_h;
+ /*!\brief Bit-depth of the codec
+ *
+ * This value identifies the bit depth of the codec.
+ * Only certain bit-depths are supported as identified in the
+ * vpx_bit_depth_t enum.
+ */
+ vpx_bit_depth_t g_bit_depth;
+
+ /*!\brief Bit-depth of the input frames
+ *
+ * This value identifies the bit depth of the input frames in bits.
+ * Note that the frames passed as input to the encoder must have
+ * this bit-depth.
+ */
+ unsigned int g_input_bit_depth;
/*!\brief Stream timebase units
*
@@ -452,14 +473,14 @@
* A buffer containing all of the stats packets produced in the first
* pass, concatenated.
*/
- struct vpx_fixed_buf rc_twopass_stats_in;
+ vpx_fixed_buf_t rc_twopass_stats_in;
/*!\brief first pass mb stats buffer.
*
* A buffer containing all of the first pass mb stats packets produced
* in the first pass, concatenated.
*/
- struct vpx_fixed_buf rc_firstpass_mb_stats_in;
+ vpx_fixed_buf_t rc_firstpass_mb_stats_in;
/*!\brief Target data rate
*
@@ -715,7 +736,7 @@
*/
vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx,
vpx_codec_iface_t *iface,
- vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_enc_cfg_t *cfg,
vpx_codec_flags_t flags,
int ver);
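Taken together, the new capability bit, the init-time flag, and the two bit-depth fields are used roughly as follows when opening a 10-bit VP9 encode; a sketch with error handling mostly elided (profile 2 is assumed for 10/12-bit 4:2:0 streams):

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    static vpx_codec_err_t open_10bit_encoder(vpx_codec_ctx_t *ctx,
                                              unsigned int w, unsigned int h) {
      vpx_codec_iface_t *iface = vpx_codec_vp9_cx();
      vpx_codec_enc_cfg_t cfg;
      vpx_codec_err_t res = vpx_codec_enc_config_default(iface, &cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      if (!(vpx_codec_get_caps(iface) & VPX_CODEC_CAP_HIGHBITDEPTH))
        return VPX_CODEC_INCAPABLE;
      cfg.g_w = w;
      cfg.g_h = h;
      cfg.g_profile = 2;              /* 10/12-bit 4:2:0 */
      cfg.g_bit_depth = VPX_BITS_10;  /* codec-internal bit depth */
      cfg.g_input_bit_depth = 10;     /* bit depth of the frames passed in */
      return vpx_codec_enc_init(ctx, iface, &cfg, VPX_CODEC_USE_HIGHBITDEPTH);
    }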
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 7b04b70..ef6d1dd 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -31,10 +31,10 @@
#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
-#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format */
-#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U plane in memory */
-#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel component */
-#define VPX_IMG_FMT_HIGH 0x800 /**< Image uses 16bit framebuffer */
+#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
+#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
+#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */
+#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
/*!\brief List of supported image formats */
typedef enum vpx_img_fmt {
@@ -59,45 +59,11 @@
VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5,
VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6,
VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7,
- VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGH,
- VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGH,
- VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGH
+ VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH,
+ VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH,
+ VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH
} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define IMG_FMT_PLANAR VPX_IMG_FMT_PLANAR /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
-#define IMG_FMT_UV_FLIP VPX_IMG_FMT_UV_FLIP /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
-#define IMG_FMT_HAS_ALPHA VPX_IMG_FMT_HAS_ALPHA /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */
-
- /*!\brief Deprecated list of supported image formats
- * \deprecated New code should use #vpx_img_fmt
- */
-#define img_fmt vpx_img_fmt
- /*!\brief alias for enum img_fmt.
- * \deprecated New code should use #vpx_img_fmt_t
- */
-#define img_fmt_t vpx_img_fmt_t
-
-#define IMG_FMT_NONE VPX_IMG_FMT_NONE /**< \deprecated Use #VPX_IMG_FMT_NONE */
-#define IMG_FMT_RGB24 VPX_IMG_FMT_RGB24 /**< \deprecated Use #VPX_IMG_FMT_RGB24 */
-#define IMG_FMT_RGB32 VPX_IMG_FMT_RGB32 /**< \deprecated Use #VPX_IMG_FMT_RGB32 */
-#define IMG_FMT_RGB565 VPX_IMG_FMT_RGB565 /**< \deprecated Use #VPX_IMG_FMT_RGB565 */
-#define IMG_FMT_RGB555 VPX_IMG_FMT_RGB555 /**< \deprecated Use #VPX_IMG_FMT_RGB555 */
-#define IMG_FMT_UYVY VPX_IMG_FMT_UYVY /**< \deprecated Use #VPX_IMG_FMT_UYVY */
-#define IMG_FMT_YUY2 VPX_IMG_FMT_YUY2 /**< \deprecated Use #VPX_IMG_FMT_YUY2 */
-#define IMG_FMT_YVYU VPX_IMG_FMT_YVYU /**< \deprecated Use #VPX_IMG_FMT_YVYU */
-#define IMG_FMT_BGR24 VPX_IMG_FMT_BGR24 /**< \deprecated Use #VPX_IMG_FMT_BGR24 */
-#define IMG_FMT_RGB32_LE VPX_IMG_FMT_RGB32_LE /**< \deprecated Use #VPX_IMG_FMT_RGB32_LE */
-#define IMG_FMT_ARGB VPX_IMG_FMT_ARGB /**< \deprecated Use #VPX_IMG_FMT_ARGB */
-#define IMG_FMT_ARGB_LE VPX_IMG_FMT_ARGB_LE /**< \deprecated Use #VPX_IMG_FMT_ARGB_LE */
-#define IMG_FMT_RGB565_LE VPX_IMG_FMT_RGB565_LE /**< \deprecated Use #VPX_IMG_FMT_RGB565_LE */
-#define IMG_FMT_RGB555_LE VPX_IMG_FMT_RGB555_LE /**< \deprecated Use #VPX_IMG_FMT_RGB555_LE */
-#define IMG_FMT_YV12 VPX_IMG_FMT_YV12 /**< \deprecated Use #VPX_IMG_FMT_YV12 */
-#define IMG_FMT_I420 VPX_IMG_FMT_I420 /**< \deprecated Use #VPX_IMG_FMT_I420 */
-#define IMG_FMT_VPXYV12 VPX_IMG_FMT_VPXYV12 /**< \deprecated Use #VPX_IMG_FMT_VPXYV12 */
-#define IMG_FMT_VPXI420 VPX_IMG_FMT_VPXI420 /**< \deprecated Use #VPX_IMG_FMT_VPXI420 */
-#endif /* VPX_CODEC_DISABLE_COMPAT */
-
/**\brief Image Descriptor */
typedef struct vpx_image {
vpx_img_fmt_t fmt; /**< Image Format */
@@ -121,13 +87,6 @@
#define VPX_PLANE_U 1 /**< U (Chroma) plane */
#define VPX_PLANE_V 2 /**< V (Chroma) plane */
#define VPX_PLANE_ALPHA 3 /**< A (Transparency) plane */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define PLANE_PACKED VPX_PLANE_PACKED
-#define PLANE_Y VPX_PLANE_Y
-#define PLANE_U VPX_PLANE_U
-#define PLANE_V VPX_PLANE_V
-#define PLANE_ALPHA VPX_PLANE_ALPHA
-#endif
unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
int stride[4]; /**< stride between rows for each plane */
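The renamed flag keeps the layout test trivial for callers walking planes; strides stay in bytes, so they are effectively halved when addressing 16-bit samples. A sketch:

    #include "vpx/vpx_image.h"
    #include "vpx/vpx_integer.h"

    /* Read the Y sample at (x, y) from either an 8-bit or a 16-bit image. */
    static int read_y_sample(const vpx_image_t *img, int x, int y) {
      if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
        const uint16_t *row = (const uint16_t *)(img->planes[VPX_PLANE_Y] +
                                                 y * img->stride[VPX_PLANE_Y]);
        return row[x];
      }
      return img->planes[VPX_PLANE_Y][y * img->stride[VPX_PLANE_Y] + x];
    }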
diff --git a/vpx_mem/vpx_mem.c b/vpx_mem/vpx_mem.c
index 059248b..da61642 100644
--- a/vpx_mem/vpx_mem.c
+++ b/vpx_mem/vpx_mem.c
@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <string.h>
#include "include/vpx_mem_intrnl.h"
+#include "vpx/vpx_integer.h"
#if CONFIG_MEM_TRACKER
#ifndef VPX_NO_GLOBALS
@@ -452,6 +453,29 @@
return VPX_MEMSET_L(dest, val, length);
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+void *vpx_memset16(void *dest, int val, size_t length) {
+ size_t i;
+ void *orig = dest;
+ uint16_t *dest16 = dest;
+#if CONFIG_MEM_CHECKS
+ if ((int)dest < 0x4000) {
+ _P(printf("WARNING: vpx_memset16 dest:0x%x val:%d len:%d\n",
+ (int)dest, val, (int)length);)
+
+#if defined(VXWORKS)
+ sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
+
+ vx_sleep(10000);
+#endif
+ }
+#endif
+ for (i = 0; i < length; i++)
+ *dest16++ = val;
+ return orig;
+}
+#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+
void *vpx_memmove(void *dest, const void *src, size_t count) {
#if CONFIG_MEM_CHECKS
diff --git a/vpx_mem/vpx_mem.h b/vpx_mem/vpx_mem.h
index 33686b2..e2391f4 100644
--- a/vpx_mem/vpx_mem.h
+++ b/vpx_mem/vpx_mem.h
@@ -73,6 +73,9 @@
void *vpx_memcpy(void *dest, const void *src, size_t length);
void *vpx_memset(void *dest, int val, size_t length);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ void *vpx_memset16(void *dest, int val, size_t length);
+#endif
void *vpx_memmove(void *dest, const void *src, size_t count);
/* special memory functions */
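A usage sketch for the new helper, which is only declared when CONFIG_VP9 and CONFIG_VP9_HIGHBITDEPTH are both set; the plane pointer, dimensions, and stride in samples are assumptions of the example:

    #include "vpx/vpx_integer.h"
    #include "vpx_mem/vpx_mem.h"

    /* Clear a 10-bit plane to its mid-grey value (512 at 10 bits). */
    static void clear_10bit_plane(uint16_t *plane, int width, int height,
                                  int stride /* in samples */) {
      int r;
      for (r = 0; r < height; ++r)
        vpx_memset16(plane + r * stride, 1 << 9, width);
    }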
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index fa0e030..f03feff 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -10,7 +10,8 @@
#include <stdlib.h>
#include <string.h>
-#include "arm.h"
+#include "vpx_ports/arm.h"
+#include "./vpx_config.h"
#ifdef WINAPI_FAMILY
#include <winapifamily.h>
@@ -54,9 +55,9 @@
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
return flags & mask;
}
@@ -87,6 +88,7 @@
/*Ignore exception.*/
}
}
+#endif /* HAVE_EDSP */
#if HAVE_MEDIA
if (mask & HAS_MEDIA)
__try {
@@ -97,7 +99,8 @@
/*Ignore exception.*/
}
}
-#if HAVE_NEON
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON || HAVE_NEON_ASM
if (mask &HAS_NEON) {
__try {
/*VORR q0,q0,q0*/
@@ -107,9 +110,7 @@
/*Ignore exception.*/
}
}
-#endif /* HAVE_NEON */
-#endif /* HAVE_MEDIA */
-#endif /* HAVE_EDSP */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
return flags & mask;
}
@@ -132,10 +133,10 @@
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
-#if HAVE_NEON
+#if HAVE_NEON || HAVE_NEON_ASM
if (features & ANDROID_CPU_ARM_FEATURE_NEON)
flags |= HAS_NEON;
-#endif /* HAVE_NEON */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
return flags & mask;
}
@@ -162,7 +163,7 @@
*/
char buf[512];
while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_EDSP || HAVE_NEON
+#if HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM
if (memcmp(buf, "Features", 8) == 0) {
char *p;
#if HAVE_EDSP
@@ -170,15 +171,15 @@
if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
flags |= HAS_EDSP;
}
-#if HAVE_NEON
+#endif /* HAVE_EDSP */
+#if HAVE_NEON || HAVE_NEON_ASM
p = strstr(buf, " neon");
if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
flags |= HAS_NEON;
}
-#endif /* HAVE_NEON */
-#endif /* HAVE_EDSP */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
}
-#endif /* HAVE_EDSP || HAVE_NEON */
+#endif /* HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM */
#if HAVE_MEDIA
if (memcmp(buf, "CPU architecture:", 17) == 0) {
int version;
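On the consumer side, run-time dispatch keys off the returned mask; a sketch, assuming the arm_cpu_caps() accessor declared in vpx_ports/arm.h:

    #include "vpx_ports/arm.h"

    static int use_neon_path(void) {
      /* HAS_NEON is now set whether NEON support was built from
       * intrinsics (HAVE_NEON) or assembly (HAVE_NEON_ASM). */
      return (arm_cpu_caps() & HAS_NEON) != 0;
    }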
diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h
index 870338b..dd98e29 100644
--- a/vpx_ports/vpx_timer.h
+++ b/vpx_ports/vpx_timer.h
@@ -11,6 +11,9 @@
#ifndef VPX_PORTS_VPX_TIMER_H_
#define VPX_PORTS_VPX_TIMER_H_
+
+#include "./vpx_config.h"
+
#include "vpx/vpx_integer.h"
#if CONFIG_OS_SUPPORT
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index 827bce7..70d7ac0 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -13,6 +13,9 @@
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vpx_mem/vpx_mem.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
/****************************************************************************
* Exports
@@ -136,7 +139,11 @@
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height,
- int ss_x, int ss_y, int border,
+ int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv) {
@@ -161,11 +168,21 @@
const int alpha_border_h = border;
const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
(uint64_t)alpha_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint64_t frame_size = (1 + use_highbitdepth) *
+ (yplane_size + 2 * uvplane_size + alpha_plane_size);
+#else
const uint64_t frame_size = yplane_size + 2 * uvplane_size +
alpha_plane_size;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#else
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint64_t frame_size =
+ (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
#else
const uint64_t frame_size = yplane_size + 2 * uvplane_size;
-#endif
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // CONFIG_ALPHA
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -231,11 +248,31 @@
ybf->border = border;
ybf->frame_size = (int)frame_size;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (use_highbitdepth) {
+ // Store uint16 addresses when using 16bit framebuffers
+ uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+ ybf->y_buffer = p + (border * y_stride) + border;
+ ybf->u_buffer = p + yplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->v_buffer = p + yplane_size + uvplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
+ ybf->u_buffer = ybf->buffer_alloc + yplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
+ (uv_border_h * uv_stride) + uv_border_w;
+ ybf->flags = 0;
+ }
+#else
ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
ybf->u_buffer = ybf->buffer_alloc + yplane_size +
(uv_border_h * uv_stride) + uv_border_w;
ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
(uv_border_h * uv_stride) + uv_border_w;
+#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_ALPHA
ybf->alpha_width = alpha_width;
@@ -252,11 +289,18 @@
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height,
- int ss_x, int ss_y, int border) {
+ int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border) {
if (ybf) {
vp9_free_frame_buffer(ybf);
- return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
- NULL, NULL, NULL);
+ return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, NULL, NULL, NULL);
}
return -2;
}
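Because the extra parameter exists only when CONFIG_VP9_HIGHBITDEPTH is set, call sites mirror the same conditional; a sketch:

    #include <string.h>
    #include "./vpx_config.h"
    #include "vpx_scale/yv12config.h"

    static int alloc_1080p_frame(YV12_BUFFER_CONFIG *buf) {
      memset(buf, 0, sizeof(*buf));
    #if CONFIG_VP9_HIGHBITDEPTH
      return vp9_alloc_frame_buffer(buf, 1920, 1080, 1, 1,
                                    1 /* use_highbitdepth */, 32);
    #else
      return vp9_alloc_frame_buffer(buf, 1920, 1080, 1, 1, 32);
    #endif
    }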
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 036a505..0485452 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -13,6 +13,9 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
static void extend_plane(uint8_t *const src, int src_stride,
int width, int height,
@@ -55,6 +58,50 @@
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride,
+ int width, int height,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i;
+ const int linesize = extend_left + extend_right + width;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+ /* copy the left and right most columns out */
+ uint16_t *src_ptr1 = src;
+ uint16_t *src_ptr2 = src + width - 1;
+ uint16_t *dst_ptr1 = src - extend_left;
+ uint16_t *dst_ptr2 = src + width;
+
+ for (i = 0; i < height; ++i) {
+ vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_stride;
+ src_ptr2 += src_stride;
+ dst_ptr1 += src_stride;
+ dst_ptr2 += src_stride;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = src - extend_left;
+ src_ptr2 = src + src_stride * (height - 1) - extend_left;
+ dst_ptr1 = src + src_stride * -extend_top - extend_left;
+ dst_ptr2 = src + src_stride * height - extend_left;
+
+ for (i = 0; i < extend_top; ++i) {
+ vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ dst_ptr1 += src_stride;
+ }
+
+ for (i = 0; i < extend_bottom; ++i) {
+ vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ dst_ptr2 += src_stride;
+ }
+}
+#endif
+
void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
const int uv_border = ybf->border / 2;
@@ -64,6 +111,31 @@
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(
+ ybf->y_buffer, ybf->y_stride,
+ ybf->y_crop_width, ybf->y_crop_height,
+ ybf->border, ybf->border,
+ ybf->border + ybf->y_height - ybf->y_crop_height,
+ ybf->border + ybf->y_width - ybf->y_crop_width);
+
+ extend_plane_high(
+ ybf->u_buffer, ybf->uv_stride,
+ (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+ ybf->border / 2, ybf->border / 2,
+ (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+ (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+
+ extend_plane_high(
+ ybf->v_buffer, ybf->uv_stride,
+ (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
+ ybf->border / 2, ybf->border / 2,
+ (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
+ (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ return;
+ }
+#endif
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ybf->border, ybf->border,
@@ -99,6 +171,20 @@
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(ybf->y_buffer, ybf->y_stride,
+ ybf->y_crop_width, ybf->y_crop_height,
+ ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
+ extend_plane_high(ybf->u_buffer, ybf->uv_stride,
+ c_w, c_h, c_et, c_el, c_eb, c_er);
+ extend_plane_high(ybf->v_buffer, ybf->uv_stride,
+ c_w, c_h, c_et, c_el, c_eb, c_er);
+ return;
+ }
+#endif
extend_plane(ybf->y_buffer, ybf->y_stride,
ybf->y_crop_width, ybf->y_crop_height,
ext_size, ext_size,
@@ -121,6 +207,14 @@
VP9INNERBORDERINPIXELS : ybf->border;
extend_frame(ybf, inner_bw);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ vpx_memcpy(dst, src, num * sizeof(uint16_t));
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9
// Copies the source image into the destination image and updates the
@@ -140,6 +234,40 @@
assert(src_ybc->y_height == dst_ybc->y_height);
#endif
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH);
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->y_width);
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+
+ src = src_ybc->u_buffer;
+ dst = dst_ybc->u_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ src = src_ybc->v_buffer;
+ dst = dst_ybc->v_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy_short_addr(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ vp8_yv12_extend_frame_borders_c(dst_ybc);
+ return;
+ } else {
+ assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
+ }
+#endif
+
for (row = 0; row < src_ybc->y_height; ++row) {
vpx_memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
@@ -173,6 +301,19 @@
const uint8_t *src = src_ybc->y_buffer;
uint8_t *dst = dst_ybc->y_buffer;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+ src16 += src_ybc->y_stride;
+ dst16 += dst_ybc->y_stride;
+ }
+ return;
+ }
+#endif
+
for (row = 0; row < src_ybc->y_height; ++row) {
vpx_memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index cdde75c..9ff764c 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -15,6 +15,7 @@
extern "C" {
#endif
+#include "vpx/vpx_codec.h"
#include "vpx/vpx_frame_buffer.h"
#include "vpx/vpx_integer.h"
@@ -50,11 +51,14 @@
int buffer_alloc_sz;
int border;
int frame_size;
+ unsigned int bit_depth;
int corrupted;
int flags;
} YV12_BUFFER_CONFIG;
+#define YV12_FLAG_HIGHBITDEPTH 1
+
int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int border);
int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
@@ -63,6 +67,9 @@
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
int border);
// Updates the yv12 buffer config with the frame buffer. If cb is not
@@ -73,6 +80,9 @@
// on failure.
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
int border,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
diff --git a/vpxdec.c b/vpxdec.c
index faee42a..091522f 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -15,13 +15,15 @@
#include <string.h>
#include <limits.h>
+#include "./vpx_config.h"
+
+#if CONFIG_LIBYUV
#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
#include "./args.h"
#include "./ivfdec.h"
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "./vpx_config.h"
#include "vpx/vpx_decoder.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/vpx_timer.h"
@@ -87,12 +89,20 @@
static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
"Compute the MD5 sum of the decoded frame");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t outbitdeptharg = ARG_DEF(
+ NULL, "output-bit-depth", 1,
+ "Output bit-depth for decoded frames");
+#endif
static const arg_def_t *all_args[] = {
&codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
&progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
&threadsarg, &verbosearg, &scalearg, &fb_arg,
&md5arg, &error_concealment, &continuearg,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &outbitdeptharg,
+#endif
NULL
};
@@ -123,8 +133,29 @@
};
#endif
+#if CONFIG_LIBYUV
static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
FilterModeEnum mode) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (src->fmt == VPX_IMG_FMT_I42016) {
+ assert(dst->fmt == VPX_IMG_FMT_I42016);
+ return I420Scale_16((uint16_t*)src->planes[VPX_PLANE_Y],
+ src->stride[VPX_PLANE_Y]/2,
+ (uint16_t*)src->planes[VPX_PLANE_U],
+ src->stride[VPX_PLANE_U]/2,
+ (uint16_t*)src->planes[VPX_PLANE_V],
+ src->stride[VPX_PLANE_V]/2,
+ src->d_w, src->d_h,
+ (uint16_t*)dst->planes[VPX_PLANE_Y],
+ dst->stride[VPX_PLANE_Y]/2,
+ (uint16_t*)dst->planes[VPX_PLANE_U],
+ dst->stride[VPX_PLANE_U]/2,
+ (uint16_t*)dst->planes[VPX_PLANE_V],
+ dst->stride[VPX_PLANE_V]/2,
+ dst->d_w, dst->d_h,
+ mode);
+ }
+#endif
assert(src->fmt == VPX_IMG_FMT_I420);
assert(dst->fmt == VPX_IMG_FMT_I420);
return I420Scale(src->planes[VPX_PLANE_Y], src->stride[VPX_PLANE_Y],
@@ -137,6 +168,7 @@
dst->d_w, dst->d_h,
mode);
}
+#endif
void usage_exit() {
int i;
@@ -260,6 +292,11 @@
static void write_image_file(const vpx_image_t *img, const int planes[3],
FILE *file) {
int i, y;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ const int bytes_per_sample = ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+#else
+ const int bytes_per_sample = 1;
+#endif
for (i = 0; i < 3; ++i) {
const int plane = planes[i];
@@ -269,7 +306,7 @@
const int h = vpx_img_plane_height(img, plane);
for (y = 0; y < h; ++y) {
- fwrite(buf, 1, w, file);
+ fwrite(buf, bytes_per_sample, w, file);
buf += stride;
}
}
@@ -489,6 +526,178 @@
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+}
+
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+ }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_upshift(dst, src, input_shift);
+ } else {
+ low_img_upshift(dst, src, input_shift);
+ }
+}
+
+static void high_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+}
+
+static void low_img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ src->fmt != dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+ }
+}
+
+static void img_downshift(vpx_image_t *dst, vpx_image_t *src,
+ int down_shift) {
+ if (dst->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_downshift(dst, src, down_shift);
+ } else {
+ low_img_downshift(dst, src, down_shift);
+ }
+}
+#endif
+
int main_loop(int argc, const char **argv_) {
vpx_codec_ctx_t decoder;
char *fn = NULL;
@@ -513,6 +722,9 @@
int opt_yv12 = 0;
int opt_i420 = 0;
vpx_codec_dec_cfg_t cfg = {0, 0, 0};
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ int output_bit_depth = 0;
+#endif
#if CONFIG_VP8_DECODER
vp8_postproc_cfg_t vp8_pp_cfg = {0};
int vp8_dbg_color_ref_frame = 0;
@@ -524,6 +736,9 @@
int dec_flags = 0;
int do_scale = 0;
vpx_image_t *scaled_img = NULL;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t *img_shifted = NULL;
+#endif
int frame_avail, got_data;
int num_external_frame_buffers = 0;
struct ExternalFrameBufferList ext_fb_list = {0, NULL};
@@ -538,7 +753,8 @@
struct VpxDecInputContext input = {NULL, NULL};
struct VpxInputContext vpx_input_ctx;
#if CONFIG_WEBM_IO
- struct WebmInputContext webm_ctx = {0};
+ struct WebmInputContext webm_ctx;
+ memset(&webm_ctx, 0, sizeof(webm_ctx));
input.webm_ctx = &webm_ctx;
#endif
input.vpx_input_ctx = &vpx_input_ctx;
@@ -563,6 +779,9 @@
use_y4m = 0;
flipuv = 1;
opt_yv12 = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ output_bit_depth = 8; // yv12 output implies 8-bit depth
+#endif
} else if (arg_match(&arg, &use_i420, argi)) {
use_y4m = 0;
flipuv = 0;
@@ -593,7 +812,13 @@
do_scale = 1;
else if (arg_match(&arg, &fb_arg, argi))
num_external_frame_buffers = arg_parse_uint(&arg);
-
+ else if (arg_match(&arg, &continuearg, argi))
+ keep_going = 1;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ else if (arg_match(&arg, &outbitdeptharg, argi)) {
+ output_bit_depth = arg_parse_uint(&arg);
+ }
+#endif
#if CONFIG_VP8_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
postproc = 1;
@@ -643,11 +868,8 @@
}
} else if (arg_match(&arg, &error_concealment, argi)) {
ec_enabled = 1;
- } else if (arg_match(&arg, &continuearg, argi)) {
- keep_going = 1;
}
-
-#endif
+#endif // CONFIG_VP8_DECODER
else
argj++;
}
@@ -883,7 +1105,7 @@
display_height = display_size[1];
}
}
- scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, display_width,
+ scaled_img = vpx_img_alloc(NULL, img->fmt, display_width,
display_height, 16);
scaled_img->bit_depth = img->bit_depth;
}
@@ -901,6 +1123,33 @@
#endif
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ // Default to codec bit depth if output bit depth not set
+ if (!output_bit_depth) {
+ output_bit_depth = img->bit_depth;
+ }
+ // Shift up or down if necessary
+ if (output_bit_depth != img->bit_depth) {
+ if (!img_shifted) {
+ if (output_bit_depth == 8) {
+ img_shifted = vpx_img_alloc(
+ NULL, img->fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ img->d_w, img->d_h, 16);
+ } else {
+ img_shifted = vpx_img_alloc(
+ NULL, img->fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+ img->d_w, img->d_h, 16);
+ }
+ img_shifted->bit_depth = output_bit_depth;
+ }
+ if (output_bit_depth > img->bit_depth) {
+ img_upshift(img_shifted, img, output_bit_depth - img->bit_depth);
+ } else {
+ img_downshift(img_shifted, img, img->bit_depth - output_bit_depth);
+ }
+ img = img_shifted;
+ }
+#endif
if (single_file) {
if (use_y4m) {
@@ -1007,6 +1256,9 @@
free(buf);
if (scaled_img) vpx_img_free(scaled_img);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img_shifted) vpx_img_free(img_shifted);
+#endif
for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
free(ext_fb_list.ext_fb[i].data);
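The shift helpers above reduce to simple per-sample arithmetic; a distilled sketch of the decode-side rounding:

    #include "vpx/vpx_integer.h"

    /* Promote a sample by `shift` bits, adding half of the new step so the
     * result sits at the centre of the wider range (mirrors the offset used
     * in the upshift helpers above). */
    static uint16_t upshift_sample(uint8_t x, int shift) {
      const int offset = shift > 0 ? 1 << (shift - 1) : 0;
      return (uint16_t)((x << shift) + offset);
    }

    /* The matching downshift simply drops the extra bits. */
    static uint8_t downshift_sample(uint16_t x, int shift) {
      return (uint8_t)(x >> shift);
    }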
diff --git a/vpxenc.c b/vpxenc.c
index 7e037a6..5afca24 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -19,12 +19,15 @@
#include <stdlib.h>
#include <string.h>
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
#include "vpx/vpx_encoder.h"
#if CONFIG_DECODERS
#include "vpx/vpx_decoder.h"
#endif
-#include "third_party/libyuv/include/libyuv/scale.h"
#include "./args.h"
#include "./ivfenc.h"
#include "./tools_common.h"
@@ -197,6 +200,10 @@
ARG_DEF(NULL, "experimental-bitstream", 0,
"Allow experimental bitstream features.");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const arg_def_t test16bitinternalarg = ARG_DEF(
+ NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer");
+#endif
static const arg_def_t *main_args[] = {
&debugmode,
@@ -245,6 +252,9 @@
#endif
&timebase, &framerate,
&error_resilient,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &test16bitinternalarg,
+#endif
&lag_in_frames, NULL
};
@@ -318,7 +328,7 @@
static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
"Noise sensitivity (frames to blur)");
static const arg_def_t sharpness = ARG_DEF(NULL, "sharpness", 1,
- "Filter sharpness (0-7)");
+ "Loop filter sharpness (0..7)");
static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1,
"Motion detection threshold");
static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,
@@ -326,11 +336,11 @@
static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,
"Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,
- "AltRef Max Frames");
+ "AltRef max frames (0..15)");
static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1,
- "AltRef Strength");
+ "AltRef filter strength (0..6)");
static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1,
- "AltRef Type");
+ "AltRef type");
static const struct arg_enum_list tuning_enum[] = {
{"psnr", VP8_TUNE_PSNR},
{"ssim", VP8_TUNE_SSIM},
@@ -375,9 +385,26 @@
"Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
"3: cyclic refresh)");
static const arg_def_t frame_periodic_boost = ARG_DEF(
- NULL, "frame_boost", 1,
+ NULL, "frame-boost", 1,
"Enable frame periodic boost (0: off (default), 1: on)");
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static const struct arg_enum_list bitdepth_enum[] = {
+ {"8", VPX_BITS_8},
+ {"10", VPX_BITS_10},
+ {"12", VPX_BITS_12},
+ {NULL, 0}
+};
+
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM("b", "bit-depth", 1,
+ "Bit depth for codec "
+ "(8 for version <=1, "
+ "10 or 12 for version 2)",
+ bitdepth_enum);
+static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1,
+ "Bit depth of input");
+#endif
+
static const struct arg_enum_list tune_content_enum[] = {
{"default", VP9E_CONTENT_DEFAULT},
{"screen", VP9E_CONTENT_SCREEN},
@@ -392,6 +419,9 @@
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
&tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
&frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content,
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ &bitdeptharg, &inbitdeptharg,
+#endif
NULL
};
static const int vp9_arg_ctrl_map[] = {
@@ -447,6 +477,102 @@
}
#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void find_mismatch_high(const vpx_image_t *const img1,
+ const vpx_image_t *const img2,
+ int yloc[4], int uloc[4], int vloc[4]) {
+ uint16_t *plane1, *plane2;
+ uint32_t stride1, stride2;
+ const uint32_t bsize = 64;
+ const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+ const uint32_t bsizex = bsize >> img1->x_chroma_shift;
+ const uint32_t c_w =
+ (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+ const uint32_t c_h =
+ (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+ int match = 1;
+ uint32_t i, j;
+ yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_Y];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_Y];
+ stride1 = img1->stride[VPX_PLANE_Y]/2;
+ stride2 = img2->stride[VPX_PLANE_Y]/2;
+ for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+ for (j = 0; match && j < img1->d_w; j += bsize) {
+ int k, l;
+ const int si = mmin(i + bsize, img1->d_h) - i;
+ const int sj = mmin(j + bsize, img1->d_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ yloc[0] = i + k;
+ yloc[1] = j + l;
+ yloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ yloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_U];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_U];
+ stride1 = img1->stride[VPX_PLANE_U]/2;
+ stride2 = img2->stride[VPX_PLANE_U]/2;
+ for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ int k, l;
+ const int si = mmin(i + bsizey, c_h) - i;
+ const int sj = mmin(j + bsizex, c_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ uloc[0] = i + k;
+ uloc[1] = j + l;
+ uloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ uloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+ plane1 = (uint16_t*)img1->planes[VPX_PLANE_V];
+ plane2 = (uint16_t*)img2->planes[VPX_PLANE_V];
+ stride1 = img1->stride[VPX_PLANE_V]/2;
+ stride2 = img2->stride[VPX_PLANE_V]/2;
+ for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ int k, l;
+ const int si = mmin(i + bsizey, c_h) - i;
+ const int sj = mmin(j + bsizex, c_w) - j;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ if (*(plane1 + (i + k) * stride1 + j + l) !=
+ *(plane2 + (i + k) * stride2 + j + l)) {
+ vloc[0] = i + k;
+ vloc[1] = j + l;
+ vloc[2] = *(plane1 + (i + k) * stride1 + j + l);
+ vloc[3] = *(plane2 + (i + k) * stride2 + j + l);
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+#endif
+
static void find_mismatch(const vpx_image_t *const img1,
const vpx_image_t *const img2,
int yloc[4], int uloc[4], int vloc[4]) {
@@ -539,7 +665,8 @@
static int compare_img(const vpx_image_t *const img1,
const vpx_image_t *const img2) {
- const uint32_t c_w =
+ uint32_t l_w = img1->d_w;
+ uint32_t c_w =
(img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
const uint32_t c_h =
(img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
@@ -549,11 +676,17 @@
match &= (img1->fmt == img2->fmt);
match &= (img1->d_w == img2->d_w);
match &= (img1->d_h == img2->d_h);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ l_w *= 2;
+ c_w *= 2;
+ }
+#endif
for (i = 0; i < img1->d_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
- img1->d_w) == 0);
+ l_w) == 0);
for (i = 0; i < c_h; ++i)
match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
@@ -598,6 +731,10 @@
int arg_ctrl_cnt;
int write_webm;
int have_kf_max_dist;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ // whether to use 16bit internal buffers
+ int use_16bit_internal;
+#endif
};
@@ -737,8 +874,9 @@
#if CONFIG_VP9_ENCODER
// Make default VP9 passes = 2 until there is a better quality 1-pass
// encoder
- global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
- global->deadline != VPX_DL_REALTIME) ? 2 : 1;
+ if (global->codec != NULL && global->codec->name != NULL)
+ global->passes = (strcmp(global->codec->name, "vp9") == 0 &&
+ global->deadline != VPX_DL_REALTIME) ? 2 : 1;
#else
global->passes = 1;
#endif
@@ -806,8 +944,10 @@
struct stream_state *stream;
stream = calloc(1, sizeof(*stream));
- if (!stream)
+ if (stream == NULL) {
fatal("Failed to allocate new stream.");
+ }
+
if (prev) {
memcpy(stream, prev, sizeof(*stream));
stream->index++;
@@ -867,6 +1007,9 @@
static const int *ctrl_args_map = NULL;
struct stream_config *config = &stream->config;
int eos_mark_found = 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ int test_16bit_internal = 0;
+#endif
// Handle codec specific options
if (0) {
@@ -915,6 +1058,12 @@
config->cfg.g_w = arg_parse_uint(&arg);
} else if (arg_match(&arg, &height, argi)) {
config->cfg.g_h = arg_parse_uint(&arg);
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ } else if (arg_match(&arg, &bitdeptharg, argi)) {
+ config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+ config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+#endif
#if CONFIG_WEBM_IO
} else if (arg_match(&arg, &stereo_mode, argi)) {
config->stereo_fmt = arg_parse_enum_or_int(&arg);
@@ -982,6 +1131,12 @@
config->have_kf_max_dist = 1;
} else if (arg_match(&arg, &kf_disabled, argi)) {
config->cfg.kf_mode = VPX_KF_DISABLED;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ } else if (arg_match(&arg, &test16bitinternalarg, argi)) {
+ if (strcmp(global->codec->name, "vp9") == 0) {
+ test_16bit_internal = 1;
+ }
+#endif
} else {
int i, match = 0;
for (i = 0; ctrl_args[i]; i++) {
@@ -993,12 +1148,13 @@
* instance of this control.
*/
for (j = 0; j < config->arg_ctrl_cnt; j++)
- if (config->arg_ctrls[j][0] == ctrl_args_map[i])
+ if (ctrl_args_map != NULL &&
+ config->arg_ctrls[j][0] == ctrl_args_map[i])
break;
/* Update/insert */
assert(j < (int)ARG_CTRL_CNT_MAX);
- if (j < (int)ARG_CTRL_CNT_MAX) {
+ if (ctrl_args_map != NULL && j < (int)ARG_CTRL_CNT_MAX) {
config->arg_ctrls[j][0] = ctrl_args_map[i];
config->arg_ctrls[j][1] = arg_parse_enum_or_int(&arg);
if (j == config->arg_ctrl_cnt)
@@ -1011,6 +1167,12 @@
argj++;
}
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (strcmp(global->codec->name, "vp9") == 0) {
+ config->use_16bit_internal = test_16bit_internal ||
+ (config->cfg.g_profile > 1);
+ }
+#endif
return eos_mark_found;
}
@@ -1038,6 +1200,14 @@
experimental_bitstream.long_name);
}
+ // Check that the codec bit depth is no smaller than the input bit depth.
+ if (stream->config.cfg.g_input_bit_depth >
+ (unsigned int)stream->config.cfg.g_bit_depth) {
+ fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+ stream->index, (int)stream->config.cfg.g_bit_depth,
+ stream->config.cfg.g_input_bit_depth);
+ }
+
for (streami = stream; streami; streami = streami->next) {
/* All streams require output files */
if (!streami->config.out_fn)
@@ -1146,6 +1316,8 @@
SHOW(g_profile);
SHOW(g_w);
SHOW(g_h);
+ SHOW(g_bit_depth);
+ SHOW(g_input_bit_depth);
SHOW(g_timebase.num);
SHOW(g_timebase.den);
SHOW(g_error_resilient);
@@ -1278,6 +1450,9 @@
flags |= global->show_psnr ? VPX_CODEC_USE_PSNR : 0;
flags |= global->out_part ? VPX_CODEC_USE_OUTPUT_PARTITION : 0;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ flags |= stream->config.use_16bit_internal ? VPX_CODEC_USE_HIGHBITDEPTH : 0;
+#endif
/* Construct Encoder Context */
vpx_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
@@ -1323,6 +1498,46 @@
/ cfg->g_timebase.num / global->framerate.num;
/* Scale if necessary */
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (img) {
+ if ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) &&
+ (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+ if (img->fmt != VPX_IMG_FMT_I42016) {
+ fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+ exit(EXIT_FAILURE);
+ }
+#if CONFIG_LIBYUV
+ if (!stream->img) {
+ stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I42016,
+ cfg->g_w, cfg->g_h, 16);
+ }
+ I420Scale_16((uint16*)img->planes[VPX_PLANE_Y],
+ img->stride[VPX_PLANE_Y]/2,
+ (uint16*)img->planes[VPX_PLANE_U],
+ img->stride[VPX_PLANE_U]/2,
+ (uint16*)img->planes[VPX_PLANE_V],
+ img->stride[VPX_PLANE_V]/2,
+ img->d_w, img->d_h,
+ (uint16*)stream->img->planes[VPX_PLANE_Y],
+ stream->img->stride[VPX_PLANE_Y]/2,
+ (uint16*)stream->img->planes[VPX_PLANE_U],
+ stream->img->stride[VPX_PLANE_U]/2,
+ (uint16*)stream->img->planes[VPX_PLANE_V],
+ stream->img->stride[VPX_PLANE_V]/2,
+ stream->img->d_w, stream->img->d_h,
+ kFilterBox);
+ img = stream->img;
+#else
+ stream->encoder.err = 1;
+ ctx_exit_on_error(&stream->encoder,
+ "Stream %d: Failed to encode frame.\n"
+ "Scaling disabled in this configuration. \n"
+ "To enable, configure with --enable-libyuv\n",
+ stream->index);
+#endif
+ }
+ }
+#endif
if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
if (img->fmt != VPX_IMG_FMT_I420 && img->fmt != VPX_IMG_FMT_YV12) {
fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
@@ -1501,6 +1716,131 @@
return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
}
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+static void high_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->w != src->w || dst->h != src->h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt || input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216:
+ case VPX_IMG_FMT_I44416:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->w;
+ int h = src->h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++)
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+}
+
+static void low_img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->w != src->w || dst->h != src->h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt + VPX_IMG_FMT_HIGHBITDEPTH ||
+ input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->w;
+ int h = src->h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+ uint16_t *p_dst = (uint16_t *)(dst->planes[plane] +
+ y * dst->stride[plane]);
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+ }
+}
+
+static void img_upshift(vpx_image_t *dst, vpx_image_t *src,
+ int input_shift) {
+ if (src->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ high_img_upshift(dst, src, input_shift);
+ } else {
+ low_img_upshift(dst, src, input_shift);
+ }
+}
+
+static void img_cast_16_to_8(vpx_image_t *dst, vpx_image_t *src) {
+ int plane;
+ if (dst->fmt + VPX_IMG_FMT_HIGHBITDEPTH != src->fmt ||
+ dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I444:
+ break;
+ default:
+ fatal("Unsupported image conversion");
+ break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w >>= src->x_chroma_shift;
+ h >>= src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src = (uint16_t *)(src->planes[plane] +
+ y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = *p_src++;
+ }
+ }
+ }
+}
+#endif
static void test_decode(struct stream_state *stream,
enum TestDecodeFatality fatal,
@@ -1527,20 +1867,44 @@
vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
} else {
- struct vp9_ref_frame ref;
+ struct vp9_ref_frame ref_enc, ref_dec;
- ref.idx = 0;
- vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);
- enc_img = ref.img;
- vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);
- dec_img = ref.img;
+ ref_enc.idx = 0;
+ ref_dec.idx = 0;
+ vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
+ enc_img = ref_enc.img;
+ vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
+ dec_img = ref_dec.img;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
+ if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ enc_img.d_w, enc_img.d_h, 16);
+ img_cast_16_to_8(&enc_img, &ref_enc.img);
+ }
+ if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+ dec_img.d_w, dec_img.d_h, 16);
+ img_cast_16_to_8(&dec_img, &ref_dec.img);
+ }
+ }
+#endif
}
ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
if (!compare_img(&enc_img, &dec_img)) {
int y[4], u[4], v[4];
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
+ find_mismatch_high(&enc_img, &dec_img, y, u, v);
+ } else {
+ find_mismatch(&enc_img, &dec_img, y, u, v);
+ }
+#else
find_mismatch(&enc_img, &dec_img, y, u, v);
+#endif
stream->decoder.err = 1;
warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
"Stream %d: Encode/decode mismatch on frame %d at"
@@ -1582,6 +1946,12 @@
int main(int argc, const char **argv_) {
int pass;
vpx_image_t raw;
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t raw_shift;
+ int allocated_raw_shift = 0;
+ int use_16bit_internal = 0;
+ int input_shift = 0;
+#endif
int frame_avail, got_data;
struct VpxInputContext input;
@@ -1683,6 +2053,27 @@
if (!input.width || !input.height)
fatal("Specify stream dimensions with --width (-w) "
" and --height (-h)");
+
+ /* If the input file does not specify a bit-depth but the input-bit-depth
+ * parameter exists, assume that to be the input bit-depth. However, if the
+ * input-bit-depth parameter does not exist, assume the input bit-depth
+ * to be the same as the codec bit-depth.
+ */
+ if (!input.bit_depth) {
+ FOREACH_STREAM({
+ if (stream->config.cfg.g_input_bit_depth)
+ input.bit_depth = stream->config.cfg.g_input_bit_depth;
+ else
+ input.bit_depth = stream->config.cfg.g_input_bit_depth =
+ (int)stream->config.cfg.g_bit_depth;
+ });
+ if (input.bit_depth > 8) input.fmt |= VPX_IMG_FMT_HIGHBITDEPTH;
+ } else {
+ FOREACH_STREAM({
+ stream->config.cfg.g_input_bit_depth = input.bit_depth;
+ });
+ }
+
FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
FOREACH_STREAM(validate_stream_config(stream, &global));
@@ -1736,6 +2127,25 @@
FOREACH_STREAM(open_output_file(stream, &global));
FOREACH_STREAM(initialize_encoder(stream, &global));
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (strcmp(global.codec->name, "vp9") == 0) {
+ // Check whether at least one stream uses 16-bit internal buffers.
+ // Currently assume that the bit depths for all streams using
+ // high bit depth are the same.
+ FOREACH_STREAM({
+ if (stream->config.use_16bit_internal) {
+ use_16bit_internal = 1;
+ }
+ if (stream->config.cfg.g_profile == 0) {
+ input_shift = 0;
+ } else {
+ input_shift = (int)stream->config.cfg.g_bit_depth -
+ stream->config.cfg.g_input_bit_depth;
+ }
+ });
+ }
+#endif
+
frame_avail = 1;
got_data = 0;
@@ -1773,10 +2183,45 @@
frame_avail = 0;
if (frames_in > global.skip_frames) {
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ vpx_image_t *frame_to_encode;
+ if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+ assert(use_16bit_internal);
+ // Input bit depth and stream bit depth do not match, so upshift
+ // the frame to the stream bit depth.
+ if (!allocated_raw_shift) {
+ vpx_img_alloc(&raw_shift, raw.fmt | VPX_IMG_FMT_HIGHBITDEPTH,
+ input.width, input.height, 32);
+ allocated_raw_shift = 1;
+ }
+ img_upshift(&raw_shift, &raw, input_shift);
+ frame_to_encode = &raw_shift;
+ } else {
+ frame_to_encode = &raw;
+ }
+ vpx_usec_timer_start(&timer);
+ if (use_16bit_internal) {
+ assert(frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH);
+ FOREACH_STREAM({
+ if (stream->config.use_16bit_internal)
+ encode_frame(stream, &global,
+ frame_avail ? frame_to_encode : NULL,
+ frames_in);
+ else
+ assert(0);
+ });
+ } else {
+ assert((frame_to_encode->fmt & VPX_IMG_FMT_HIGHBITDEPTH) == 0);
+ FOREACH_STREAM(encode_frame(stream, &global,
+ frame_avail ? frame_to_encode : NULL,
+ frames_in));
+ }
+#else
vpx_usec_timer_start(&timer);
FOREACH_STREAM(encode_frame(stream, &global,
frame_avail ? &raw : NULL,
frames_in));
+#endif
vpx_usec_timer_mark(&timer);
cx_time += vpx_usec_timer_elapsed(&timer);
@@ -1785,7 +2230,8 @@
got_data = 0;
FOREACH_STREAM(get_cx_data(stream, &global, &got_data));
- if (!got_data && input.length && !streams->frames_out) {
+ if (!got_data && input.length && streams != NULL &&
+ !streams->frames_out) {
lagged_count = global.limit ? seen_frames : ftello(input.file);
} else if (input.length) {
int64_t remaining;
@@ -1893,6 +2339,10 @@
});
#endif
+#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
+ if (allocated_raw_shift)
+ vpx_img_free(&raw_shift);
+#endif
vpx_img_free(&raw);
free(argv);
free(streams);
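The encoder-side upshift biases its offset one below half precisely so that a later downshift by the same amount is lossless; a small self-check of that identity:

    #include <assert.h>

    /* For any 8-bit x and shift s >= 1:
     *   ((x << s) + (1 << (s - 1)) - 1) >> s == x
     * because the added offset stays strictly below 1 << s. */
    static void check_upshift_roundtrip(int s) {
      int x;
      for (x = 0; x < 256; ++x) {
        const int up = (x << s) + ((1 << (s - 1)) - 1);
        assert((up >> s) == x);
      }
    }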
diff --git a/y4minput.c b/y4minput.c
index 520c332..34ea96d 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -700,7 +700,7 @@
int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
int only_420) {
- char buffer[80];
+ char buffer[80] = {0};
int ret;
int i;
/*Read until newline, or 80 cols, whichever happens first.*/
@@ -978,7 +978,9 @@
_y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
else
_y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
- _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+
+ if (_y4m->aux_buf_sz > 0)
+ _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
return 0;
}
@@ -1039,12 +1041,12 @@
c_w *= bytes_per_sample;
c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
c_sz = c_w * c_h;
- _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] =
+ _img->stride[VPX_PLANE_Y] = _img->stride[VPX_PLANE_ALPHA] =
_y4m->pic_w * bytes_per_sample;
- _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
- _img->planes[PLANE_Y] = _y4m->dst_buf;
- _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
- _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
- _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
+ _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = c_w;
+ _img->planes[VPX_PLANE_Y] = _y4m->dst_buf;
+ _img->planes[VPX_PLANE_U] = _y4m->dst_buf + pic_sz;
+ _img->planes[VPX_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+ _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
return 1;
}