Merge "Fix ubsan left shift warnings in warped motion library" into nextgenv2
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..bfaa1f6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,264 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+cmake_minimum_required(VERSION 3.2)
+project(AOM C CXX)
+
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+
+set(AOM_SRCS
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/aom/aomdx.h"
+ "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+ "${AOM_ROOT}/aom/src/aom_codec.c"
+ "${AOM_ROOT}/aom/src/aom_decoder.c"
+ "${AOM_ROOT}/aom/src/aom_encoder.c"
+ "${AOM_ROOT}/aom/src/aom_image.c")
+
+set(AOM_DSP_SRCS
+ "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+ "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+ "${AOM_ROOT}/aom_dsp/aom_filter.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd.c"
+ "${AOM_ROOT}/aom_dsp/aom_simd.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/avg.c"
+ "${AOM_ROOT}/aom_dsp/bitreader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+ "${AOM_ROOT}/aom_dsp/blend.h"
+ "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+ "${AOM_ROOT}/aom_dsp/dkboolreader.c"
+ "${AOM_ROOT}/aom_dsp/dkboolreader.h"
+ "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
+ "${AOM_ROOT}/aom_dsp/dkboolwriter.h"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
+ "${AOM_ROOT}/aom_dsp/intrapred.c"
+ "${AOM_ROOT}/aom_dsp/inv_txfm.c"
+ "${AOM_ROOT}/aom_dsp/inv_txfm.h"
+ "${AOM_ROOT}/aom_dsp/loopfilter.c"
+ "${AOM_ROOT}/aom_dsp/prob.c"
+ "${AOM_ROOT}/aom_dsp/prob.h"
+ "${AOM_ROOT}/aom_dsp/psnr.c"
+ "${AOM_ROOT}/aom_dsp/psnr.h"
+ "${AOM_ROOT}/aom_dsp/quantize.c"
+ "${AOM_ROOT}/aom_dsp/quantize.h"
+ "${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/subtract.c"
+ "${AOM_ROOT}/aom_dsp/txfm_common.h"
+ "${AOM_ROOT}/aom_dsp/variance.c"
+ "${AOM_ROOT}/aom_dsp/variance.h")
+
+set(AOM_MEM_SRCS
+ "${AOM_ROOT}/aom_mem/aom_mem.c"
+ "${AOM_ROOT}/aom_mem/aom_mem.h"
+ "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+set(AOM_SCALE_SRCS
+ "${AOM_ROOT}/aom_scale/aom_scale.h"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+ "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+ "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+ "${AOM_ROOT}/aom_scale/yv12config.h")
+
+# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
+set(AOM_UTIL_SRCS
+ "${AOM_ROOT}/aom_ports/aom_once.h"
+ "${AOM_ROOT}/aom_ports/aom_timer.h"
+ "${AOM_ROOT}/aom_ports/bitops.h"
+ "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+ "${AOM_ROOT}/aom_ports/mem.h"
+ "${AOM_ROOT}/aom_ports/mem_ops.h"
+ "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+ "${AOM_ROOT}/aom_ports/msvc.h"
+ "${AOM_ROOT}/aom_ports/system_state.h"
+ "${AOM_ROOT}/aom_util/aom_thread.c"
+ "${AOM_ROOT}/aom_util/aom_thread.h"
+ "${AOM_ROOT}/aom_util/endian_inl.h")
+
+set(AOM_AV1_COMMON_SRCS
+ "${AOM_ROOT}/av1/av1_iface_common.h"
+ "${AOM_ROOT}/av1/common/alloccommon.c"
+ "${AOM_ROOT}/av1/common/alloccommon.h"
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
+ "${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm.h"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c"
+ "${AOM_ROOT}/av1/common/blockd.c"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.h"
+ "${AOM_ROOT}/av1/common/convolve.c"
+ "${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/debugmodes.c"
+ "${AOM_ROOT}/av1/common/entropy.c"
+ "${AOM_ROOT}/av1/common/entropy.h"
+ "${AOM_ROOT}/av1/common/entropymode.c"
+ "${AOM_ROOT}/av1/common/entropymode.h"
+ "${AOM_ROOT}/av1/common/entropymv.c"
+ "${AOM_ROOT}/av1/common/entropymv.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/filter.c"
+ "${AOM_ROOT}/av1/common/filter.h"
+ "${AOM_ROOT}/av1/common/frame_buffers.c"
+ "${AOM_ROOT}/av1/common/frame_buffers.h"
+ "${AOM_ROOT}/av1/common/idct.c"
+ "${AOM_ROOT}/av1/common/idct.h"
+ "${AOM_ROOT}/av1/common/loopfilter.c"
+ "${AOM_ROOT}/av1/common/loopfilter.h"
+ "${AOM_ROOT}/av1/common/mv.h"
+ "${AOM_ROOT}/av1/common/mvref_common.c"
+ "${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/odintrin.c"
+ "${AOM_ROOT}/av1/common/odintrin.h"
+ "${AOM_ROOT}/av1/common/onyxc_int.h"
+ "${AOM_ROOT}/av1/common/pred_common.c"
+ "${AOM_ROOT}/av1/common/pred_common.h"
+ "${AOM_ROOT}/av1/common/quant_common.c"
+ "${AOM_ROOT}/av1/common/quant_common.h"
+ "${AOM_ROOT}/av1/common/reconinter.c"
+ "${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconintra.c"
+ "${AOM_ROOT}/av1/common/reconintra.h"
+ "${AOM_ROOT}/av1/common/scale.c"
+ "${AOM_ROOT}/av1/common/scale.h"
+ "${AOM_ROOT}/av1/common/scan.c"
+ "${AOM_ROOT}/av1/common/scan.h"
+ "${AOM_ROOT}/av1/common/seg_common.c"
+ "${AOM_ROOT}/av1/common/seg_common.h"
+ "${AOM_ROOT}/av1/common/thread_common.c"
+ "${AOM_ROOT}/av1/common/thread_common.h"
+ "${AOM_ROOT}/av1/common/tile_common.c"
+ "${AOM_ROOT}/av1/common/tile_common.h")
+
+set(AOM_AV1_DECODER_SRCS
+ "${AOM_ROOT}/av1/av1_dx_iface.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.h"
+ "${AOM_ROOT}/av1/decoder/decodemv.c"
+ "${AOM_ROOT}/av1/decoder/decodemv.h"
+ "${AOM_ROOT}/av1/decoder/decoder.c"
+ "${AOM_ROOT}/av1/decoder/decoder.h"
+ "${AOM_ROOT}/av1/decoder/detokenize.c"
+ "${AOM_ROOT}/av1/decoder/detokenize.h"
+ "${AOM_ROOT}/av1/decoder/dsubexp.c"
+ "${AOM_ROOT}/av1/decoder/dsubexp.h"
+ "${AOM_ROOT}/av1/decoder/dthread.c"
+ "${AOM_ROOT}/av1/decoder/dthread.h")
+
+set(AOM_AV1_ENCODER_SRCS
+ "${AOM_ROOT}/av1/av1_cx_iface.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+ "${AOM_ROOT}/av1/encoder/aq_variance.c"
+ "${AOM_ROOT}/av1/encoder/aq_variance.h"
+ "${AOM_ROOT}/av1/encoder/bitstream.c"
+ "${AOM_ROOT}/av1/encoder/bitstream.h"
+ "${AOM_ROOT}/av1/encoder/block.h"
+ "${AOM_ROOT}/av1/encoder/context_tree.c"
+ "${AOM_ROOT}/av1/encoder/context_tree.h"
+ "${AOM_ROOT}/av1/encoder/cost.c"
+ "${AOM_ROOT}/av1/encoder/cost.h"
+ "${AOM_ROOT}/av1/encoder/dct.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.h"
+ "${AOM_ROOT}/av1/encoder/encodemb.c"
+ "${AOM_ROOT}/av1/encoder/encodemb.h"
+ "${AOM_ROOT}/av1/encoder/encodemv.c"
+ "${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encoder.c"
+ "${AOM_ROOT}/av1/encoder/encoder.h"
+ "${AOM_ROOT}/av1/encoder/ethread.c"
+ "${AOM_ROOT}/av1/encoder/ethread.h"
+ "${AOM_ROOT}/av1/encoder/extend.c"
+ "${AOM_ROOT}/av1/encoder/extend.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/lookahead.c"
+ "${AOM_ROOT}/av1/encoder/lookahead.h"
+ "${AOM_ROOT}/av1/encoder/mbgraph.c"
+ "${AOM_ROOT}/av1/encoder/mbgraph.h"
+ "${AOM_ROOT}/av1/encoder/mcomp.c"
+ "${AOM_ROOT}/av1/encoder/mcomp.h"
+ "${AOM_ROOT}/av1/encoder/picklpf.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/quantize.c"
+ "${AOM_ROOT}/av1/encoder/quantize.h"
+ "${AOM_ROOT}/av1/encoder/ratectrl.c"
+ "${AOM_ROOT}/av1/encoder/ratectrl.h"
+ "${AOM_ROOT}/av1/encoder/rd.c"
+ "${AOM_ROOT}/av1/encoder/rd.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.c"
+ "${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/resize.c"
+ "${AOM_ROOT}/av1/encoder/resize.h"
+ "${AOM_ROOT}/av1/encoder/segmentation.c"
+ "${AOM_ROOT}/av1/encoder/segmentation.h"
+ "${AOM_ROOT}/av1/encoder/speed_features.c"
+ "${AOM_ROOT}/av1/encoder/speed_features.h"
+ "${AOM_ROOT}/av1/encoder/subexp.c"
+ "${AOM_ROOT}/av1/encoder/subexp.h"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/tokenize.c"
+ "${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/treewriter.c"
+ "${AOM_ROOT}/av1/encoder/treewriter.h")
+
+# Targets
+add_library(aom_dsp ${AOM_DSP_SRCS})
+add_library(aom_mem ${AOM_MEM_SRCS})
+add_library(aom_scale ${AOM_SCALE_SRCS})
+add_library(aom_util ${AOM_UTIL_SRCS})
+add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
+add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
+add_library(aom ${AOM_SRCS})
+target_link_libraries(aom LINK_PUBLIC
+ aom_dsp
+ aom_mem
+ aom_scale
+ aom_util
+ aom_av1_decoder
+ aom_av1_encoder)
+add_executable(simple_decoder examples/simple_decoder.c)
+include_directories(${AOM_ROOT})
+target_link_libraries(simple_decoder LINK_PUBLIC aom)
+add_executable(simple_encoder examples/simple_encoder.c)
+include_directories(${AOM_ROOT})
+target_link_libraries(simple_encoder LINK_PUBLIC aom)
+
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index c46778b..0e9a671 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -20,6 +20,9 @@
#include "aom_dsp/prob.h"
#include "aom_dsp/ans.h"
#include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
#ifdef __cplusplus
extern "C" {
@@ -29,6 +32,9 @@
const uint8_t *buf;
int buf_offset;
uint32_t state;
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
};
static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
@@ -119,6 +125,9 @@
// 110xxxxx implies this byte is a superframe marker
return 1;
}
+#if CONFIG_ACCOUNTING
+ ans->accounting = NULL;
+#endif
ans->state += L_BASE;
if (ans->state >= L_BASE * IO_BASE) return 1;
return 0;
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 4735199..eebdc0c 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -191,6 +191,7 @@
endif # CONFIG_AOM_HIGHBITDEPTH
DSP_SRCS-yes += txfm_common.h
+DSP_SRCS-yes += x86/txfm_common_intrin.h
DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
# forward transform
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6d873bc..b073b1b 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -701,6 +701,34 @@
#
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2/;
+
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
+
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2/;
+
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2/;
+
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2/;
+
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2/;
+
+ # High bit depth
add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct4x4 sse2/;
@@ -724,33 +752,34 @@
add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct32x32_1/;
- } # CONFIG_AOM_HIGHBITDEPTH
- add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4 sse2 msa/;
+ } else {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2 msa/;
- add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4_1 sse2/;
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
- add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
- add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8_1 sse2 neon msa/;
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2 neon msa/;
- add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16 sse2 msa/;
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2 msa/;
- add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32 sse2 avx2 msa/;
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+ } # CONFIG_AOM_HIGHBITDEPTH
} # CONFIG_AV1_ENCODER
#
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index ef2e5e9..478945b 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -16,6 +16,10 @@
#include <limits.h>
#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
+#endif
+
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#if CONFIG_ANS
@@ -203,7 +207,8 @@
return ret;
}
-static INLINE int aom_read_symbol_(aom_reader *r, const aom_cdf_prob *cdf,
+#if CONFIG_EC_MULTISYMBOL
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
int nsymbs ACCT_STR_PARAM) {
int ret;
#if CONFIG_RANS
@@ -212,17 +217,21 @@
#elif CONFIG_DAALA_EC
ret = daala_read_symbol(r, cdf, nsymbs);
#else
- (void)r;
- (void)cdf;
- (void)nsymbs;
- assert(0 && "Unsupported bitreader operation");
- ret = -1;
+#error \
+ "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+ "coder. Enable daala_ec or ans for a valid configuration."
#endif
+
+#if CONFIG_EC_ADAPT
+ update_cdf(cdf, ret, nsymbs);
+#endif
+
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
return ret;
}
+#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 841a171..ef529fc 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -14,6 +14,10 @@
#include <assert.h>
#include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
+#endif
+
#if CONFIG_ANS
#include "aom_dsp/buf_ans.h"
#elif CONFIG_DAALA_EC
@@ -98,8 +102,9 @@
#endif
}
-static INLINE void aom_write_symbol(aom_writer *w, int symb,
- const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_EC_MULTISYMBOL
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs) {
#if CONFIG_RANS
struct rans_sym s;
(void)nsymbs;
@@ -110,13 +115,16 @@
#elif CONFIG_DAALA_EC
daala_write_symbol(w, symb, cdf, nsymbs);
#else
- (void)w;
- (void)symb;
- (void)cdf;
- (void)nsymbs;
- assert(0 && "Unsupported bitwriter operation");
+#error \
+ "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+ "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+
+#if CONFIG_EC_ADAPT
+ update_cdf(cdf, symb, nsymbs);
#endif
}
+#endif // CONFIG_EC_MULTISYMBOL
#ifdef __cplusplus
} // extern "C"
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index e0dda12..4bb656b 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -93,7 +93,7 @@
}
}
-void idct4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step[4];
tran_high_t temp1, temp2;
// stage 1
@@ -121,7 +121,7 @@
// Rows
for (i = 0; i < 4; ++i) {
- idct4_c(input, outptr);
+ aom_idct4_c(input, outptr);
input += 4;
outptr += 4;
}
@@ -129,7 +129,7 @@
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- idct4_c(temp_in, temp_out);
+ aom_idct4_c(temp_in, temp_out);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 4));
@@ -154,7 +154,7 @@
}
}
-void idct8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
@@ -216,7 +216,7 @@
// First transform rows
for (i = 0; i < 8; ++i) {
- idct8_c(input, outptr);
+ aom_idct8_c(input, outptr);
input += 8;
outptr += 8;
}
@@ -224,7 +224,7 @@
// Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- idct8_c(temp_in, temp_out);
+ aom_idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -244,7 +244,7 @@
}
}
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_low_t x0 = input[0];
@@ -281,7 +281,7 @@
output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
-void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
@@ -367,7 +367,7 @@
// First transform rows
// only first 4 row has non-zero coefs
for (i = 0; i < 4; ++i) {
- idct8_c(input, outptr);
+ aom_idct8_c(input, outptr);
input += 8;
outptr += 8;
}
@@ -375,7 +375,7 @@
// Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- idct8_c(temp_in, temp_out);
+ aom_idct8_c(temp_in, temp_out);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -383,7 +383,7 @@
}
}
-void idct16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
@@ -557,7 +557,7 @@
// First transform rows
for (i = 0; i < 16; ++i) {
- idct16_c(input, outptr);
+ aom_idct16_c(input, outptr);
input += 16;
outptr += 16;
}
@@ -565,7 +565,7 @@
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- idct16_c(temp_in, temp_out);
+ aom_idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -573,7 +573,7 @@
}
}
-void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -754,7 +754,7 @@
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
for (i = 0; i < 4; ++i) {
- idct16_c(input, outptr);
+ aom_idct16_c(input, outptr);
input += 16;
outptr += 16;
}
@@ -762,7 +762,7 @@
// Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- idct16_c(temp_in, temp_out);
+ aom_idct16_c(temp_in, temp_out);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -782,7 +782,7 @@
}
}
-void idct32_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
@@ -1168,7 +1168,7 @@
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
- idct32_c(input, outptr);
+ aom_idct32_c(input, outptr);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
@@ -1178,7 +1178,7 @@
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
+ aom_idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1196,7 +1196,7 @@
// Rows
// only upper-left 16x16 has non-zero coeff
for (i = 0; i < 16; ++i) {
- idct32_c(input, outptr);
+ aom_idct32_c(input, outptr);
input += 32;
outptr += 32;
}
@@ -1204,7 +1204,7 @@
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
+ aom_idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1222,7 +1222,7 @@
// Rows
// only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
- idct32_c(input, outptr);
+ aom_idct32_c(input, outptr);
input += 32;
outptr += 32;
}
@@ -1230,7 +1230,7 @@
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- idct32_c(temp_in, temp_out);
+ aom_idct32_c(temp_in, temp_out);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
ROUND_POWER_OF_TWO(temp_out[j], 6));
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index 0f84e38..c3d794e 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -97,13 +97,13 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_EMULATE_HARDWARE
-void idct4_c(const tran_low_t *input, tran_low_t *output);
-void idct8_c(const tran_low_t *input, tran_low_t *output);
-void idct16_c(const tran_low_t *input, tran_low_t *output);
-void idct32_c(const tran_low_t *input, tran_low_t *output);
-void iadst4_c(const tran_low_t *input, tran_low_t *output);
-void iadst8_c(const tran_low_t *input, tran_low_t *output);
-void iadst16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/aom_dsp/prob.c b/aom_dsp/prob.c
index d3556cb..a98a4bc 100644
--- a/aom_dsp/prob.c
+++ b/aom_dsp/prob.c
@@ -11,7 +11,7 @@
#include "./aom_config.h"
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
#include <string.h>
#endif
@@ -57,7 +57,7 @@
tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
typedef struct tree_node tree_node;
struct tree_node {
@@ -86,7 +86,7 @@
int i;
uint32_t pa;
uint32_t pb;
- for (i = 0; i < OD_MINI(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
+ for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
}
pa = tree_node_prob(a, i);
pb = tree_node_prob(b, i);
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index bf9abbf..9384ffe 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -15,6 +15,7 @@
#include "./aom_config.h"
#include "./aom_dsp_common.h"
+#include "aom_ports/bitops.h"
#include "aom_ports/mem.h"
#ifdef __cplusplus
@@ -32,7 +33,7 @@
typedef int8_t aom_tree_index;
-#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2)
+#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
#define aom_complement(x) (255 - x)
@@ -96,7 +97,7 @@
void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
const unsigned int *counts, aom_prob *probs);
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
int *pth, int *len);
@@ -134,6 +135,22 @@
DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]);
+#if CONFIG_EC_ADAPT
+static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+ const int rate = 4 + get_msb(nsymbs);
+ int i, diff, tmp;
+ for (i = 0; i < nsymbs; ++i) {
+ tmp = (i + 1) << (12 - rate);
+ cdf[i] -= ((cdf[i] - tmp) >> rate);
+ }
+ diff = 32768 - cdf[nsymbs - 1];
+
+ for (i = val; i < nsymbs; ++i) {
+ cdf[i] += diff;
+ }
+}
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
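
A minimal usage sketch for the update_cdf() helper added to aom_dsp/prob.h
above (illustration only, not part of this patch). It assumes the
32768-scaled CDF convention implied by "diff = 32768 - cdf[nsymbs - 1]"; the
4-symbol CDF values and the coded symbol 2 are hypothetical:

  // Uniform 4-symbol CDF, scaled so the last entry is 32768.
  aom_cdf_prob cdf[4] = { 8192, 16384, 24576, 32768 };
  // Decay every entry toward its floor, then restore the tail from the
  // observed symbol onward; the probability of symbol 2 increases.
  update_cdf(cdf, 2, 4);

With CONFIG_EC_ADAPT enabled, aom_read_symbol_() and aom_write_symbol() call
this after each coded symbol, so the decoder and encoder adapt identically.
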
diff --git a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
index 8b136e7..2167395 100644
--- a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -12,6 +12,7 @@
#include <immintrin.h> // AVX2
#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_intrin.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
#if FDCT32x32_HIGH_PRECISION
@@ -31,7 +32,19 @@
}
#endif
-void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
+#ifndef STORE_COEFF_FUNC
+#define STORE_COEFF_FUNC
+static void store_coeff(const __m256i *coeff, tran_low_t *curr,
+ tran_low_t *next) {
+ __m128i u = _mm256_castsi256_si128(*coeff);
+ storeu_output(&u, curr);
+ u = _mm256_extractf128_si256(*coeff, 1);
+ storeu_output(&u, next);
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
+ int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -2842,13 +2855,14 @@
{
int transpose_block;
int16_t *output_currStep, *output_nextStep;
- if (0 == pass) {
- output_currStep = &intermediate[column_start * 32];
- output_nextStep = &intermediate[(column_start + 8) * 32];
- } else {
- output_currStep = &output_org[column_start * 32];
- output_nextStep = &output_org[(column_start + 8) * 32];
- }
+ tran_low_t *curr_out, *next_out;
+ // Pass 0
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ // Pass 1
+ curr_out = &output_org[column_start * 32];
+ next_out = &output_org[(column_start + 8) * 32];
+
for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
__m256i *this_out = &out[8 * transpose_block];
// 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
@@ -2948,44 +2962,58 @@
tr2_6 = _mm256_srai_epi16(tr2_6, 2);
tr2_7 = _mm256_srai_epi16(tr2_7, 2);
}
- // Note: even though all these stores are aligned, using the aligned
- // intrinsic make the code slightly slower.
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
- _mm256_castsi256_si128(tr2_0));
- _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
- _mm256_castsi256_si128(tr2_1));
- _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
- _mm256_castsi256_si128(tr2_2));
- _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
- _mm256_castsi256_si128(tr2_3));
- _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
- _mm256_castsi256_si128(tr2_4));
- _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
- _mm256_castsi256_si128(tr2_5));
- _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
- _mm256_castsi256_si128(tr2_6));
- _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
- _mm256_castsi256_si128(tr2_7));
+ if (0 == pass) {
+        // Note: even though all these stores are aligned, using the aligned
+        // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
- _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
- _mm256_extractf128_si256(tr2_0, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
- _mm256_extractf128_si256(tr2_1, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
- _mm256_extractf128_si256(tr2_2, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
- _mm256_extractf128_si256(tr2_3, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
- _mm256_extractf128_si256(tr2_4, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
- _mm256_extractf128_si256(tr2_5, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
- _mm256_extractf128_si256(tr2_6, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
- _mm256_extractf128_si256(tr2_7, 1));
- // Process next 8x8
- output_currStep += 8;
- output_nextStep += 8;
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ if (1 == pass) {
+ store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
+ store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
+ store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
+ store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
+ store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
+ store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
+ store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
+ store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
+ curr_out += 8;
+ next_out += 8;
+ }
}
}
}
diff --git a/aom_dsp/x86/fwd_txfm_avx2.c b/aom_dsp/x86/fwd_txfm_avx2.c
index d381a6e..670f864 100644
--- a/aom_dsp/x86/fwd_txfm_avx2.c
+++ b/aom_dsp/x86/fwd_txfm_avx2.c
@@ -17,14 +17,6 @@
#undef FDCT32x32_2D_AVX2
#undef FDCT32x32_HIGH_PRECISION
-// TODO(luoyi): The following macro hides an error. The second parameter type of
-// function,
-// void FDCT32x32_2D_AVX2(const int16_t *, int16_t*, int);
-// is different from the one in,
-// void aom_fdct32x32_avx2(const int16_t *, tran_low_t*, int);
-// In CONFIG_AOM_HIGHBITDEPTH=1 build, the second parameter type should be
-// int32_t.
-// This function should be removed after av1_fht32x32 scaling/rounding fix.
#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
#define FDCT32x32_HIGH_PRECISION 1
#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index 3261584..fe3e446 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,6 +12,8 @@
#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#include "aom_dsp/x86/txfm_common_intrin.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -257,19 +259,6 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
}
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_AOM_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_storeu_si128((__m128i *)(dst_ptr), out0);
- _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_AOM_HIGHBITDEPTH
-}
-
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
const __m128i *pmultiplier,
const __m128i *prounding,
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 6f3c470..5b2aab2 100644
--- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -130,12 +130,30 @@
psraw m%2, 1
%endmacro
+%macro STORE_OUTPUT 2 ; index, result
+%if CONFIG_AOM_HIGHBITDEPTH
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%else
+ mova [outputq + 2*%1], m%2
+%endif
+%endmacro
+
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m12, [pw_11585x2]
- pxor m11, m11
lea r3, [2 * strideq]
lea r4, [4 * strideq]
@@ -173,14 +191,14 @@
DIVIDE_ROUND_2X 4, 5, 9, 10
DIVIDE_ROUND_2X 6, 7, 9, 10
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
+ STORE_OUTPUT 0, 0
+ STORE_OUTPUT 8, 1
+ STORE_OUTPUT 16, 2
+ STORE_OUTPUT 24, 3
+ STORE_OUTPUT 32, 4
+ STORE_OUTPUT 40, 5
+ STORE_OUTPUT 48, 6
+ STORE_OUTPUT 56, 7
RET
%endif
diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c
index 4735d97..2217a46 100644
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@@ -171,7 +171,7 @@
RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
-void idct4_sse2(__m128i *in) {
+void aom_idct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -207,7 +207,7 @@
in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}
-void iadst4_sse2(__m128i *in) {
+void aom_iadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -533,7 +533,7 @@
RECON_AND_STORE(dest + 7 * stride, dc_value);
}
-void idct8_sse2(__m128i *in) {
+void aom_idct8_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -558,7 +558,7 @@
in[4], in[5], in[6], in[7]);
}
-void iadst8_sse2(__m128i *in) {
+void aom_iadst8_sse2(__m128i *in) {
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -2114,13 +2114,13 @@
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-void idct16_sse2(__m128i *in0, __m128i *in1) {
+void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
idct16_8col(in0);
idct16_8col(in1);
}
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
iadst16_8col(in0);
iadst16_8col(in1);
@@ -3596,7 +3596,7 @@
if (!test) {
// Do the row transform
- idct4_sse2(inptr);
+ aom_idct4_sse2(inptr);
// Check the min & max values
max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3632,7 +3632,7 @@
}
if (optimised_cols) {
- idct4_sse2(inptr);
+ aom_idct4_sse2(inptr);
// Final round and shift
inptr[0] = _mm_add_epi16(inptr[0], eight);
@@ -3712,7 +3712,7 @@
if (!test) {
// Do the row transform
- idct8_sse2(inptr);
+ aom_idct8_sse2(inptr);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3749,7 +3749,7 @@
}
if (optimised_cols) {
- idct8_sse2(inptr);
+ aom_idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
@@ -3813,7 +3813,7 @@
if (!test) {
// Do the row transform
- idct8_sse2(inptr);
+ aom_idct8_sse2(inptr);
// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
@@ -3852,7 +3852,7 @@
}
if (optimised_cols) {
- idct8_sse2(inptr);
+ aom_idct8_sse2(inptr);
// Final round & shift and Reconstruction and Store
{
@@ -3918,7 +3918,7 @@
if (!test) {
// Do the row transform
- idct16_sse2(inptr, inptr + 16);
+ aom_idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3960,7 +3960,7 @@
}
if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
+ aom_idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
@@ -4033,7 +4033,7 @@
if (!test) {
// Do the row transform (N.B. This transposes inptr)
- idct16_sse2(inptr, inptr + 16);
+ aom_idct16_sse2(inptr, inptr + 16);
// Find the min & max for the column transform
// N.B. Only first 4 cols contain non-zero coeffs
@@ -4078,7 +4078,7 @@
}
if (optimised_cols) {
- idct16_sse2(inptr, inptr + 16);
+ aom_idct16_sse2(inptr, inptr + 16);
// Final round & shift and Reconstruction and Store
{
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index c271b28..4ebb34d 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -197,12 +197,12 @@
void iadst16_8col(__m128i *in);
void idct16_8col(__m128i *in);
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
+void aom_idct4_sse2(__m128i *in);
+void aom_idct8_sse2(__m128i *in);
+void aom_idct16_sse2(__m128i *in0, __m128i *in1);
+void aom_iadst4_sse2(__m128i *in);
+void aom_iadst8_sse2(__m128i *in);
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
void idct32_8col(__m128i *in0, __m128i *in1);
#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 7dc17f0..39e9b8e 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -14,6 +14,8 @@
#include <immintrin.h>
+#include "aom_dsp/txfm_common.h"
+
#define pair256_set_epi16(a, b) \
_mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
(int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
_mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
(int)(b), (int)(a))
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+ const __m256i control = _mm256_set_epi16(
+ 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+ 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+ __m256i v = _mm256_shuffle_epi8(*u, control);
+ *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i y0 = _mm256_madd_epi16(a0, cospi);
+ __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+ y0 = _mm256_add_epi32(y0, dct_rounding);
+ y1 = _mm256_add_epi32(y1, dct_rounding);
+ y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+ return _mm256_packs_epi32(y0, y1);
+}
+
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+ const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i u0, u1;
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 1);
+
+ u0 = _mm256_unpacklo_epi16(zero, in[i]);
+ u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+ u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+ u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+ u0 = _mm256_add_epi32(u0, dct_const_rounding);
+ u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ in[i] = _mm256_packs_epi32(u0, u1);
+ i++;
+ }
+}
+
#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h
new file mode 100644
index 0000000..890e048
--- /dev/null
+++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+
+// Note:
+// This header file should be included after any x86 intrinsics header files
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+}
+
+#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
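
A hedged scalar sketch of what storeu_output() above does per 8-lane store
when CONFIG_AOM_HIGHBITDEPTH makes tran_low_t 32 bits wide: each 16-bit
coefficient is sign-extended before the unaligned store (the src16 name is
illustrative, not from the patch):

  // Scalar equivalent of the unpacklo/unpackhi-with-sign-mask sequence.
  for (int k = 0; k < 8; ++k) dst_ptr[k] = (tran_low_t)(int16_t)src16[k];

Without CONFIG_AOM_HIGHBITDEPTH the helper reduces to a single
_mm_storeu_si128() of the eight 16-bit coefficients.
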
diff --git a/aomenc.c b/aomenc.c
index e32a922..ef174c2 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -1964,6 +1964,7 @@
{ stream->config.cfg.g_input_bit_depth = input.bit_depth; });
}
+#if CONFIG_AOM_HIGHBITDEPTH
FOREACH_STREAM({
if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
/* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
@@ -1980,7 +1981,6 @@
default: break;
}
}
-#if CONFIG_AOM_HIGHBITDEPTH
/* Automatically set the codec bit depth to match the input bit depth.
* Upgrade the profile if required. */
if (stream->config.cfg.g_input_bit_depth >
@@ -2003,7 +2003,6 @@
if (stream->config.cfg.g_profile > 1) {
stream->config.use_16bit_internal = 1;
}
-#endif
if (profile_updated) {
fprintf(stderr,
"Warning: automatically upgrading to profile %d to "
@@ -2011,6 +2010,31 @@
stream->config.cfg.g_profile);
}
});
+#else
+ FOREACH_STREAM({
+ if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+ /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+ was selected. */
+ switch (stream->config.cfg.g_profile) {
+ case 0:
+ stream->config.cfg.g_profile = 1;
+ profile_updated = 1;
+ break;
+ case 2:
+ stream->config.cfg.g_profile = 3;
+ profile_updated = 1;
+ break;
+ default: break;
+ }
+ }
+ if (profile_updated) {
+ fprintf(stderr,
+ "Warning: automatically upgrading to profile %d to "
+ "match input format.\n",
+ stream->config.cfg.g_profile);
+ }
+ });
+#endif
FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
FOREACH_STREAM(validate_stream_config(stream, &global));
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 38fb6fd..0fe4a89 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -30,8 +30,6 @@
AV1_COMMON_SRCS-yes += common/filter.c
AV1_COMMON_SRCS-yes += common/idct.h
AV1_COMMON_SRCS-yes += common/idct.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.c
AV1_COMMON_SRCS-yes += common/loopfilter.h
AV1_COMMON_SRCS-yes += common/thread_common.h
AV1_COMMON_SRCS-yes += common/mv.h
@@ -61,8 +59,6 @@
AV1_COMMON_SRCS-yes += common/scan.c
AV1_COMMON_SRCS-yes += common/scan.h
# TODO(angiebird) the forward transform belongs under encoder/
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c
AV1_COMMON_SRCS-yes += common/av1_txfm.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
@@ -78,8 +74,8 @@
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_filters_sse4.h
endif
-AV1_COMMON_SRCS-yes += common/av1_convolve.c
-AV1_COMMON_SRCS-yes += common/av1_convolve.h
+AV1_COMMON_SRCS-yes += common/convolve.c
+AV1_COMMON_SRCS-yes += common/convolve.h
AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
@@ -122,10 +118,9 @@
AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
ifeq ($(CONFIG_AV1_ENCODER),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_impl_sse2.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
@@ -143,7 +138,4 @@
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
endif
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.h
-
$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index a7eb71e..e8069d6 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -142,10 +142,10 @@
return AOM_CODEC_INVALID_PARAM; \
} while (0)
-#define RANGE_CHECK(p, memb, lo, hi) \
- do { \
- if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
- ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
} while (0)
#define RANGE_CHECK_HI(p, memb, hi) \
@@ -176,7 +176,7 @@
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
RANGE_CHECK_BOOL(extra_cfg, lossless);
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 1);
- RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
RANGE_CHECK_HI(cfg, g_threads, 64);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -189,8 +189,8 @@
RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
- RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
- RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
if (extra_cfg->max_gf_interval > 0) {
RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
}
@@ -200,8 +200,8 @@
}
if (cfg->rc_resize_allowed == 1) {
- RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
- RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
+ RANGE_CHECK_HI(cfg, rc_scaled_width, cfg->g_w);
+ RANGE_CHECK_HI(cfg, rc_scaled_height, cfg->g_h);
}
// AV1 does not support a lower bound on the keyframe interval in
@@ -212,9 +212,9 @@
"kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
- RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
#if CONFIG_EXT_REFS
- RANGE_CHECK(extra_cfg, enable_auto_bwd_ref, 0, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
#endif // CONFIG_EXT_REFS
RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -239,13 +239,13 @@
RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
}
#else
- RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
- RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 2);
#endif // CONFIG_EXT_TILE
RANGE_CHECK_HI(extra_cfg, sharpness, 7);
- RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
- RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
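
A minimal, self-contained sketch of the reworked range-check macros above, for reference. The struct, the ERROR stand-in, and main() are invented for illustration; only the RANGE_CHECK body mirrors the diff, and the RANGE_CHECK_HI body shown here is assumed, since the hunk above only shows its first line. The old formulation ((p)->memb == lo || (p)->memb > (lo)) is equivalent to >= lo but harder to read, and for unsigned members with a lower bound of 0 the comparison against 0 is tautological, which is presumably why several call sites switch to RANGE_CHECK_HI.

#include <stdio.h>

#define ERROR(str)              \
  do {                          \
    printf("error: %s\n", str); \
    return 1;                   \
  } while (0)

#define RANGE_CHECK(p, memb, lo, hi)                   \
  do {                                                 \
    if (!((p)->memb >= (lo) && (p)->memb <= (hi)))     \
      ERROR(#memb " out of range [" #lo ".." #hi "]"); \
  } while (0)

#define RANGE_CHECK_HI(p, memb, hi)             \
  do {                                          \
    if (!((p)->memb <= (hi)))                   \
      ERROR(#memb " out of range [.." #hi "]"); \
  } while (0)

struct cfg_sketch {
  unsigned int sharpness; /* unsigned: lower bound of 0 is implicit */
  int cpu_used;           /* signed: still needs both bounds */
};

int main(void) {
  struct cfg_sketch c = { 9, -3 };
  RANGE_CHECK_HI(&c, sharpness, 7); /* 9 > 7: reports the error and returns */
  RANGE_CHECK(&c, cpu_used, -8, 8); /* not reached in this example */
  return 0;
}
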
diff --git a/av1/common/av1_fwd_txfm.c b/av1/common/av1_fwd_txfm.c
deleted file mode 100644
index 84a3876..0000000
--- a/av1/common/av1_fwd_txfm.c
+++ /dev/null
@@ -1,813 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/av1_fwd_txfm.h"
-#include <assert.h>
-#include "./av1_rtcd.h"
-
-void av1_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[4 * 4];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t in_high[4]; // canbe16
- tran_high_t step[4]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 4; ++i) {
- // Load inputs.
- if (0 == pass) {
- in_high[0] = input[0 * stride] * 16;
- in_high[1] = input[1 * stride] * 16;
- in_high[2] = input[2 * stride] * 16;
- in_high[3] = input[3 * stride] * 16;
- if (i == 0 && in_high[0]) {
- in_high[0] += 1;
- }
- } else {
- assert(in_low != NULL);
- in_high[0] = in_low[0 * 4];
- in_high[1] = in_low[1 * 4];
- in_high[2] = in_low[2 * 4];
- in_high[3] = in_low[3 * 4];
- in_low++;
- }
- // Transform.
- step[0] = in_high[0] + in_high[3];
- step[1] = in_high[1] + in_high[2];
- step[2] = in_high[1] - in_high[2];
- step[3] = in_high[0] - in_high[3];
- temp1 = (step[0] + step[1]) * cospi_16_64;
- temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = (tran_low_t)fdct_round_shift(temp1);
- out[2] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
- temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[3] = (tran_low_t)fdct_round_shift(temp2);
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 4;
- }
- // Setup in_low/out for next pass.
- in_low = intermediate;
- out = output;
- }
-
- {
- int i, j;
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
- }
- }
-}
-
-void av1_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 4; ++r)
- for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
- output[0] = sum << 1;
- output[1] = 0;
-}
-
-void av1_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
- int i, j;
- tran_low_t intermediate[64];
- int pass;
- tran_low_t *output = intermediate;
- const tran_low_t *in = NULL;
-
- // Transform columns
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- for (i = 0; i < 8; i++) {
- // stage 1
- if (pass == 0) {
- s0 = (input[0 * stride] + input[7 * stride]) * 4;
- s1 = (input[1 * stride] + input[6 * stride]) * 4;
- s2 = (input[2 * stride] + input[5 * stride]) * 4;
- s3 = (input[3 * stride] + input[4 * stride]) * 4;
- s4 = (input[3 * stride] - input[4 * stride]) * 4;
- s5 = (input[2 * stride] - input[5 * stride]) * 4;
- s6 = (input[1 * stride] - input[6 * stride]) * 4;
- s7 = (input[0 * stride] - input[7 * stride]) * 4;
- ++input;
- } else {
- s0 = in[0 * 8] + in[7 * 8];
- s1 = in[1 * 8] + in[6 * 8];
- s2 = in[2 * 8] + in[5 * 8];
- s3 = in[3 * 8] + in[4 * 8];
- s4 = in[3 * 8] - in[4 * 8];
- s5 = in[2 * 8] - in[5 * 8];
- s6 = in[1 * 8] - in[6 * 8];
- s7 = in[0 * 8] - in[7 * 8];
- ++in;
- }
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = (tran_low_t)fdct_round_shift(t0);
- output[2] = (tran_low_t)fdct_round_shift(t2);
- output[4] = (tran_low_t)fdct_round_shift(t1);
- output[6] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = (tran_low_t)fdct_round_shift(t0);
- output[3] = (tran_low_t)fdct_round_shift(t2);
- output[5] = (tran_low_t)fdct_round_shift(t1);
- output[7] = (tran_low_t)fdct_round_shift(t3);
- output += 8;
- }
- in = intermediate;
- output = final_output;
- }
-
- // Rows
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
- }
-}
-
-void av1_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 8; ++r)
- for (c = 0; c < 8; ++c) sum += input[r * stride + c];
-
- output[0] = sum;
- output[1] = 0;
-}
-
-void av1_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[256];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t step1[8]; // canbe16
- tran_high_t step2[8]; // canbe16
- tran_high_t step3[8]; // canbe16
- tran_high_t in_high[8]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 16; i++) {
- if (0 == pass) {
- // Calculate input for the first 8 results.
- in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
- in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
- in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
- in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
- in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
- in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
- in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
- in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
- // Calculate input for the next 8 results.
- step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
- step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
- step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
- step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
- step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
- step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
- step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
- step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
- } else {
- // Calculate input for the first 8 results.
- assert(in_low != NULL);
- in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
- in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
- in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
- in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
- in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
- in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
- in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
- in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
- // Calculate input for the next 8 results.
- step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
- step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
- step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
- step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
- step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
- step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
- step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
- step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
- in_low++;
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- // stage 1
- s0 = in_high[0] + in_high[7];
- s1 = in_high[1] + in_high[6];
- s2 = in_high[2] + in_high[5];
- s3 = in_high[3] + in_high[4];
- s4 = in_high[3] - in_high[4];
- s5 = in_high[2] - in_high[5];
- s6 = in_high[1] - in_high[6];
- s7 = in_high[0] - in_high[7];
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
- t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = (tran_low_t)fdct_round_shift(t0);
- out[4] = (tran_low_t)fdct_round_shift(t2);
- out[8] = (tran_low_t)fdct_round_shift(t1);
- out[12] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = (tran_low_t)fdct_round_shift(t0);
- out[6] = (tran_low_t)fdct_round_shift(t2);
- out[10] = (tran_low_t)fdct_round_shift(t1);
- out[14] = (tran_low_t)fdct_round_shift(t3);
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- temp1 = (step1[5] - step1[2]) * cospi_16_64;
- temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = fdct_round_shift(temp1);
- step2[3] = fdct_round_shift(temp2);
- temp1 = (step1[4] + step1[3]) * cospi_16_64;
- temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = fdct_round_shift(temp1);
- step2[5] = fdct_round_shift(temp2);
- // step 3
- step3[0] = step1[0] + step2[3];
- step3[1] = step1[1] + step2[2];
- step3[2] = step1[1] - step2[2];
- step3[3] = step1[0] - step2[3];
- step3[4] = step1[7] - step2[4];
- step3[5] = step1[6] - step2[5];
- step3[6] = step1[6] + step2[5];
- step3[7] = step1[7] + step2[4];
- // step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
- step2[1] = fdct_round_shift(temp1);
- step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = fdct_round_shift(temp1);
- step2[6] = fdct_round_shift(temp2);
- // step 5
- step1[0] = step3[0] + step2[1];
- step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] + step2[2];
- step1[3] = step3[3] - step2[2];
- step1[4] = step3[4] - step2[5];
- step1[5] = step3[4] + step2[5];
- step1[6] = step3[7] - step2[6];
- step1[7] = step3[7] + step2[6];
- // step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[9] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = (tran_low_t)fdct_round_shift(temp1);
- out[13] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = (tran_low_t)fdct_round_shift(temp1);
- out[11] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = (tran_low_t)fdct_round_shift(temp1);
- out[15] = (tran_low_t)fdct_round_shift(temp2);
- }
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 16;
- }
- // Setup in/out for next pass.
- in_low = intermediate;
- out = output;
- }
-}
-
-void av1_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 16; ++r)
- for (c = 0; c < 16; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 1;
- output[1] = 0;
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- // TODO(debargha, peter.derivaz): Find new bounds for this assert,
- // and make the bounds consts.
- // assert(-131072 <= rv && rv <= 131071);
- return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
- tran_high_t rv = (input + 1 + (input < 0)) >> 2;
- return rv;
-}
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
- tran_high_t step[32];
- // Stage 1
- step[0] = input[0] + input[(32 - 1)];
- step[1] = input[1] + input[(32 - 2)];
- step[2] = input[2] + input[(32 - 3)];
- step[3] = input[3] + input[(32 - 4)];
- step[4] = input[4] + input[(32 - 5)];
- step[5] = input[5] + input[(32 - 6)];
- step[6] = input[6] + input[(32 - 7)];
- step[7] = input[7] + input[(32 - 8)];
- step[8] = input[8] + input[(32 - 9)];
- step[9] = input[9] + input[(32 - 10)];
- step[10] = input[10] + input[(32 - 11)];
- step[11] = input[11] + input[(32 - 12)];
- step[12] = input[12] + input[(32 - 13)];
- step[13] = input[13] + input[(32 - 14)];
- step[14] = input[14] + input[(32 - 15)];
- step[15] = input[15] + input[(32 - 16)];
- step[16] = -input[16] + input[(32 - 17)];
- step[17] = -input[17] + input[(32 - 18)];
- step[18] = -input[18] + input[(32 - 19)];
- step[19] = -input[19] + input[(32 - 20)];
- step[20] = -input[20] + input[(32 - 21)];
- step[21] = -input[21] + input[(32 - 22)];
- step[22] = -input[22] + input[(32 - 23)];
- step[23] = -input[23] + input[(32 - 24)];
- step[24] = -input[24] + input[(32 - 25)];
- step[25] = -input[25] + input[(32 - 26)];
- step[26] = -input[26] + input[(32 - 27)];
- step[27] = -input[27] + input[(32 - 28)];
- step[28] = -input[28] + input[(32 - 29)];
- step[29] = -input[29] + input[(32 - 30)];
- step[30] = -input[30] + input[(32 - 31)];
- step[31] = -input[31] + input[(32 - 32)];
-
- // Stage 2
- output[0] = step[0] + step[16 - 1];
- output[1] = step[1] + step[16 - 2];
- output[2] = step[2] + step[16 - 3];
- output[3] = step[3] + step[16 - 4];
- output[4] = step[4] + step[16 - 5];
- output[5] = step[5] + step[16 - 6];
- output[6] = step[6] + step[16 - 7];
- output[7] = step[7] + step[16 - 8];
- output[8] = -step[8] + step[16 - 9];
- output[9] = -step[9] + step[16 - 10];
- output[10] = -step[10] + step[16 - 11];
- output[11] = -step[11] + step[16 - 12];
- output[12] = -step[12] + step[16 - 13];
- output[13] = -step[13] + step[16 - 14];
- output[14] = -step[14] + step[16 - 15];
- output[15] = -step[15] + step[16 - 16];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = step[18];
- output[19] = step[19];
-
- output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
- output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
- output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
- output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
- output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
- output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
- output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
- output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
- output[28] = step[28];
- output[29] = step[29];
- output[30] = step[30];
- output[31] = step[31];
-
- // dump the magnitude by 4, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- output[0] = half_round_shift(output[0]);
- output[1] = half_round_shift(output[1]);
- output[2] = half_round_shift(output[2]);
- output[3] = half_round_shift(output[3]);
- output[4] = half_round_shift(output[4]);
- output[5] = half_round_shift(output[5]);
- output[6] = half_round_shift(output[6]);
- output[7] = half_round_shift(output[7]);
- output[8] = half_round_shift(output[8]);
- output[9] = half_round_shift(output[9]);
- output[10] = half_round_shift(output[10]);
- output[11] = half_round_shift(output[11]);
- output[12] = half_round_shift(output[12]);
- output[13] = half_round_shift(output[13]);
- output[14] = half_round_shift(output[14]);
- output[15] = half_round_shift(output[15]);
-
- output[16] = half_round_shift(output[16]);
- output[17] = half_round_shift(output[17]);
- output[18] = half_round_shift(output[18]);
- output[19] = half_round_shift(output[19]);
- output[20] = half_round_shift(output[20]);
- output[21] = half_round_shift(output[21]);
- output[22] = half_round_shift(output[22]);
- output[23] = half_round_shift(output[23]);
- output[24] = half_round_shift(output[24]);
- output[25] = half_round_shift(output[25]);
- output[26] = half_round_shift(output[26]);
- output[27] = half_round_shift(output[27]);
- output[28] = half_round_shift(output[28]);
- output[29] = half_round_shift(output[29]);
- output[30] = half_round_shift(output[30]);
- output[31] = half_round_shift(output[31]);
- }
-
- // Stage 3
- step[0] = output[0] + output[(8 - 1)];
- step[1] = output[1] + output[(8 - 2)];
- step[2] = output[2] + output[(8 - 3)];
- step[3] = output[3] + output[(8 - 4)];
- step[4] = -output[4] + output[(8 - 5)];
- step[5] = -output[5] + output[(8 - 6)];
- step[6] = -output[6] + output[(8 - 7)];
- step[7] = -output[7] + output[(8 - 8)];
- step[8] = output[8];
- step[9] = output[9];
- step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
- step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
- step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
- step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
- step[14] = output[14];
- step[15] = output[15];
-
- step[16] = output[16] + output[23];
- step[17] = output[17] + output[22];
- step[18] = output[18] + output[21];
- step[19] = output[19] + output[20];
- step[20] = -output[20] + output[19];
- step[21] = -output[21] + output[18];
- step[22] = -output[22] + output[17];
- step[23] = -output[23] + output[16];
- step[24] = -output[24] + output[31];
- step[25] = -output[25] + output[30];
- step[26] = -output[26] + output[29];
- step[27] = -output[27] + output[28];
- step[28] = output[28] + output[27];
- step[29] = output[29] + output[26];
- step[30] = output[30] + output[25];
- step[31] = output[31] + output[24];
-
- // Stage 4
- output[0] = step[0] + step[3];
- output[1] = step[1] + step[2];
- output[2] = -step[2] + step[1];
- output[3] = -step[3] + step[0];
- output[4] = step[4];
- output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
- output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
- output[7] = step[7];
- output[8] = step[8] + step[11];
- output[9] = step[9] + step[10];
- output[10] = -step[10] + step[9];
- output[11] = -step[11] + step[8];
- output[12] = -step[12] + step[15];
- output[13] = -step[13] + step[14];
- output[14] = step[14] + step[13];
- output[15] = step[15] + step[12];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
- output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
- output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
- output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
- output[22] = step[22];
- output[23] = step[23];
- output[24] = step[24];
- output[25] = step[25];
- output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
- output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
- output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
- output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
- output[30] = step[30];
- output[31] = step[31];
-
- // Stage 5
- step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
- step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
- step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
- step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
- step[4] = output[4] + output[5];
- step[5] = -output[5] + output[4];
- step[6] = -output[6] + output[7];
- step[7] = output[7] + output[6];
- step[8] = output[8];
- step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
- step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
- step[11] = output[11];
- step[12] = output[12];
- step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
- step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
- step[15] = output[15];
-
- step[16] = output[16] + output[19];
- step[17] = output[17] + output[18];
- step[18] = -output[18] + output[17];
- step[19] = -output[19] + output[16];
- step[20] = -output[20] + output[23];
- step[21] = -output[21] + output[22];
- step[22] = output[22] + output[21];
- step[23] = output[23] + output[20];
- step[24] = output[24] + output[27];
- step[25] = output[25] + output[26];
- step[26] = -output[26] + output[25];
- step[27] = -output[27] + output[24];
- step[28] = -output[28] + output[31];
- step[29] = -output[29] + output[30];
- step[30] = output[30] + output[29];
- step[31] = output[31] + output[28];
-
- // Stage 6
- output[0] = step[0];
- output[1] = step[1];
- output[2] = step[2];
- output[3] = step[3];
- output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
- output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
- output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
- output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
- output[8] = step[8] + step[9];
- output[9] = -step[9] + step[8];
- output[10] = -step[10] + step[11];
- output[11] = step[11] + step[10];
- output[12] = step[12] + step[13];
- output[13] = -step[13] + step[12];
- output[14] = -step[14] + step[15];
- output[15] = step[15] + step[14];
-
- output[16] = step[16];
- output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
- output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
- output[19] = step[19];
- output[20] = step[20];
- output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
- output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
- output[23] = step[23];
- output[24] = step[24];
- output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
- output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
- output[27] = step[27];
- output[28] = step[28];
- output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
- output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
- output[31] = step[31];
-
- // Stage 7
- step[0] = output[0];
- step[1] = output[1];
- step[2] = output[2];
- step[3] = output[3];
- step[4] = output[4];
- step[5] = output[5];
- step[6] = output[6];
- step[7] = output[7];
- step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
- step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
- step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
- step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
- step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
- step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
- step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
- step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
- step[16] = output[16] + output[17];
- step[17] = -output[17] + output[16];
- step[18] = -output[18] + output[19];
- step[19] = output[19] + output[18];
- step[20] = output[20] + output[21];
- step[21] = -output[21] + output[20];
- step[22] = -output[22] + output[23];
- step[23] = output[23] + output[22];
- step[24] = output[24] + output[25];
- step[25] = -output[25] + output[24];
- step[26] = -output[26] + output[27];
- step[27] = output[27] + output[26];
- step[28] = output[28] + output[29];
- step[29] = -output[29] + output[28];
- step[30] = -output[30] + output[31];
- step[31] = output[31] + output[30];
-
- // Final stage --- outputs indices are bit-reversed.
- output[0] = step[0];
- output[16] = step[1];
- output[8] = step[2];
- output[24] = step[3];
- output[4] = step[4];
- output[20] = step[5];
- output[12] = step[6];
- output[28] = step[7];
- output[2] = step[8];
- output[18] = step[9];
- output[10] = step[10];
- output[26] = step[11];
- output[6] = step[12];
- output[22] = step[13];
- output[14] = step[14];
- output[30] = step[15];
-
- output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
- output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
- output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
- output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
- output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
- output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
- output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
- output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
- output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
- output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
- output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
- output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
- output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
- output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
- output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
- output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
-
-void av1_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void av1_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- // TODO(cd): see quality impact of only doing
- // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
- // PS: also change code in av1_dsp/x86/av1_dct_sse2.c
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-
-void av1_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 32; ++r)
- for (c = 0; c < 32; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 3;
- output[1] = 0;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct4x4_c(input, output, stride);
-}
-
-void av1_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_1_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_c(input, output, stride);
-}
-
-void av1_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_1_c(input, output, stride);
-}
-
-void av1_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- av1_fdct32x32_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_rd_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_1_c(input, out, stride);
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
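
For reference, a hedged sketch of the fixed-point rounding convention the deleted forward transforms above depend on. The constant values (DCT_CONST_BITS = 14, cospi_16_64 = 11585) are assumed from the usual libaom/libvpx txfm_common headers rather than shown in this diff, and the tran_high_t typedef below is a stand-in.

#include <stdio.h>
#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

typedef int64_t tran_high_t; /* wide accumulator, as in the deleted code */

static tran_high_t fdct_round_shift(tran_high_t input) {
  /* Fold the 2^14 scale of the cospi constants back out, with rounding. */
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  const tran_high_t cospi_16_64 = 11585; /* assumed: round(cos(pi/4) * 2^14) */
  tran_high_t x0 = 100, x1 = 60;
  /* One butterfly output, as used repeatedly in av1_fdct4x4_c above:
   * (x0 + x1) * cospi_16_64, scaled back down to the input range. */
  printf("%lld\n", (long long)fdct_round_shift((x0 + x1) * cospi_16_64));
  return 0;
}
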
diff --git a/av1/common/av1_fwd_txfm.h b/av1/common/av1_fwd_txfm.h
deleted file mode 100644
index db763e5..0000000
--- a/av1/common/av1_fwd_txfm.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_COMMON_AV1_FWD_TXFM_H_
-#define AV1_COMMON_AV1_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/fwd_txfm.h"
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif // AV1_COMMON_AV1_FWD_TXFM_H_
diff --git a/av1/common/av1_fwd_txfm2d_cfg.h b/av1/common/av1_fwd_txfm2d_cfg.h
index 49d324d..5a7c218 100644
--- a/av1/common/av1_fwd_txfm2d_cfg.h
+++ b/av1/common/av1_fwd_txfm2d_cfg.h
@@ -109,7 +109,7 @@
}; // .txfm_type_row
// ---------------- config fwd_dct_dct_64 ----------------
-static const int8_t fwd_shift_dct_dct_64[3] = { 2, -2, -2 };
+static const int8_t fwd_shift_dct_dct_64[3] = { 0, -2, -2 };
static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19
};
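
A hedged sketch of how a fwd_shift[3] table of this kind is commonly consumed (the actual 2-D transform driver is not part of this diff): one shift applied before the column pass, one between the passes, and one after the row pass, with the sign convention assumed to be positive = scale up, negative = scale down with rounding. Under that reading, changing the first entry from 2 to 0 makes the pre-column scaling a no-op and lowers the magnitude carried through the column stages by two bits.

#include <stdint.h>
#include <stdio.h>

typedef int32_t coeff_sketch_t; /* stand-in element type */

/* Apply one entry of the assumed fwd_shift[] convention to a buffer. */
static void round_shift_array_sketch(coeff_sketch_t *arr, int size, int shift) {
  if (shift == 0) return; /* the new fwd_shift_dct_dct_64[0] case */
  for (int i = 0; i < size; ++i) {
    if (shift > 0)
      arr[i] = arr[i] * (1 << shift); /* scale up */
    else
      arr[i] = (coeff_sketch_t)((arr[i] + (1 << (-shift - 1))) >> (-shift)); /* round, scale down */
  }
}

int main(void) {
  coeff_sketch_t col[4] = { 100, -100, 50, -50 };
  round_shift_array_sketch(col, 4, 0);  /* pre-column shift, now a no-op */
  round_shift_array_sketch(col, 4, -2); /* mid/post shift: divide by 4, rounded */
  for (int i = 0; i < 4; ++i) printf("%d\n", col[i]);
  return 0;
}
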
diff --git a/av1/common/av1_inv_txfm.c b/av1/common/av1_inv_txfm.c
deleted file mode 100644
index 4b2f061..0000000
--- a/av1/common/av1_inv_txfm.c
+++ /dev/null
@@ -1,2468 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_inv_txfm.h"
-
-void av1_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = WRAPLOW(a1);
- op[1] = WRAPLOW(b1);
- op[2] = WRAPLOW(c1);
- op[3] = WRAPLOW(d1);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
- dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
- dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
- dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
- ip++;
- dest++;
- }
-}
-
-void av1_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = WRAPLOW(a1);
- op[1] = op[2] = op[3] = WRAPLOW(e1);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
- ip++;
- dest++;
- }
-}
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1));
- step[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1));
- step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- output[0] = WRAPLOW(step[0] + step[3]);
- output[1] = WRAPLOW(step[1] + step[2]);
- output[2] = WRAPLOW(step[1] - step[2]);
- output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void av1_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_idct4_c(input, outptr);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_idct4_c(temp_in, temp_out);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 4));
- }
- }
-}
-
-void av1_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- int i;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = clip_pixel_add(dest[0], a1);
- dest[1] = clip_pixel_add(dest[1], a1);
- dest[2] = clip_pixel_add(dest[2], a1);
- dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
- }
-}
-
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- temp1 = (step1[0] + step1[2]) * cospi_16_64;
- temp2 = (step1[0] - step1[2]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- // stage 3
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- // stage 4
- output[0] = WRAPLOW(step1[0] + step1[7]);
- output[1] = WRAPLOW(step1[1] + step1[6]);
- output[2] = WRAPLOW(step1[2] + step1[5]);
- output[3] = WRAPLOW(step1[3] + step1[4]);
- output[4] = WRAPLOW(step1[3] - step1[4]);
- output[5] = WRAPLOW(step1[2] - step1[5]);
- output[6] = WRAPLOW(step1[1] - step1[6]);
- output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void av1_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- for (i = 0; i < 8; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void av1_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = WRAPLOW(x0 - x2 + x3);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
- output[2] = WRAPLOW(dct_const_round_shift(s2));
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
- return;
- }
-
- // stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
- s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
- s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
- s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
- s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
- // stage 2
- s0 = (int)x0;
- s1 = (int)x1;
- s2 = (int)x2;
- s3 = (int)x3;
- s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
- s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
- s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
- s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
- // stage 3
- s2 = (int)(cospi_16_64 * (x2 + x3));
- s3 = (int)(cospi_16_64 * (x2 - x3));
- s6 = (int)(cospi_16_64 * (x6 + x7));
- s7 = (int)(cospi_16_64 * (x6 - x7));
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x4);
- output[2] = WRAPLOW(x6);
- output[3] = WRAPLOW(-x2);
- output[4] = WRAPLOW(x3);
- output[5] = WRAPLOW(-x7);
- output[6] = WRAPLOW(x5);
- output[7] = WRAPLOW(-x1);
-}
-
-void av1_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- // only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = WRAPLOW(step2[0] + step2[15]);
- output[1] = WRAPLOW(step2[1] + step2[14]);
- output[2] = WRAPLOW(step2[2] + step2[13]);
- output[3] = WRAPLOW(step2[3] + step2[12]);
- output[4] = WRAPLOW(step2[4] + step2[11]);
- output[5] = WRAPLOW(step2[5] + step2[10]);
- output[6] = WRAPLOW(step2[6] + step2[9]);
- output[7] = WRAPLOW(step2[7] + step2[8]);
- output[8] = WRAPLOW(step2[7] - step2[8]);
- output[9] = WRAPLOW(step2[6] - step2[9]);
- output[10] = WRAPLOW(step2[5] - step2[10]);
- output[11] = WRAPLOW(step2[4] - step2[11]);
- output[12] = WRAPLOW(step2[3] - step2[12]);
- output[13] = WRAPLOW(step2[2] - step2[13]);
- output[14] = WRAPLOW(step2[1] - step2[14]);
- output[15] = WRAPLOW(step2[0] - step2[15]);
-}
-
-void av1_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows
- for (i = 0; i < 16; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4);
- x1 = WRAPLOW(s1 + s5);
- x2 = WRAPLOW(s2 + s6);
- x3 = WRAPLOW(s3 + s7);
- x4 = WRAPLOW(s0 - s4);
- x5 = WRAPLOW(s1 - s5);
- x6 = WRAPLOW(s2 - s6);
- x7 = WRAPLOW(s3 - s7);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
- x8 = WRAPLOW(s8 + s10);
- x9 = WRAPLOW(s9 + s11);
- x10 = WRAPLOW(s8 - s10);
- x11 = WRAPLOW(s9 - s11);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
- x10 = WRAPLOW(dct_const_round_shift(s10));
- x11 = WRAPLOW(dct_const_round_shift(s11));
- x14 = WRAPLOW(dct_const_round_shift(s14));
- x15 = WRAPLOW(dct_const_round_shift(s15));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x8);
- output[2] = WRAPLOW(x12);
- output[3] = WRAPLOW(-x4);
- output[4] = WRAPLOW(x6);
- output[5] = WRAPLOW(x14);
- output[6] = WRAPLOW(x10);
- output[7] = WRAPLOW(x2);
- output[8] = WRAPLOW(x3);
- output[9] = WRAPLOW(x11);
- output[10] = WRAPLOW(x15);
- output[11] = WRAPLOW(x7);
- output[12] = WRAPLOW(x5);
- output[13] = WRAPLOW(-x13);
- output[14] = WRAPLOW(x9);
- output[15] = WRAPLOW(-x1);
-}
-
-void av1_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1));
- step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1));
- step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
- step2[16] = WRAPLOW(step1[16] + step1[17]);
- step2[17] = WRAPLOW(step1[16] - step1[17]);
- step2[18] = WRAPLOW(-step1[18] + step1[19]);
- step2[19] = WRAPLOW(step1[18] + step1[19]);
- step2[20] = WRAPLOW(step1[20] + step1[21]);
- step2[21] = WRAPLOW(step1[20] - step1[21]);
- step2[22] = WRAPLOW(-step1[22] + step1[23]);
- step2[23] = WRAPLOW(step1[22] + step1[23]);
- step2[24] = WRAPLOW(step1[24] + step1[25]);
- step2[25] = WRAPLOW(step1[24] - step1[25]);
- step2[26] = WRAPLOW(-step1[26] + step1[27]);
- step2[27] = WRAPLOW(step1[26] + step1[27]);
- step2[28] = WRAPLOW(step1[28] + step1[29]);
- step2[29] = WRAPLOW(step1[28] - step1[29]);
- step2[30] = WRAPLOW(-step1[30] + step1[31]);
- step2[31] = WRAPLOW(step1[30] + step1[31]);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1));
- step1[7] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
- step1[8] = WRAPLOW(step2[8] + step2[9]);
- step1[9] = WRAPLOW(step2[8] - step2[9]);
- step1[10] = WRAPLOW(-step2[10] + step2[11]);
- step1[11] = WRAPLOW(step2[10] + step2[11]);
- step1[12] = WRAPLOW(step2[12] + step2[13]);
- step1[13] = WRAPLOW(step2[12] - step2[13]);
- step1[14] = WRAPLOW(-step2[14] + step2[15]);
- step1[15] = WRAPLOW(step2[14] + step2[15]);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1));
- step1[30] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1));
- step2[1] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1));
- step2[3] = WRAPLOW(dct_const_round_shift(temp2));
- step2[4] = WRAPLOW(step1[4] + step1[5]);
- step2[5] = WRAPLOW(step1[4] - step1[5]);
- step2[6] = WRAPLOW(-step1[6] + step1[7]);
- step2[7] = WRAPLOW(step1[6] + step1[7]);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1));
- step2[14] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = WRAPLOW(step1[16] + step1[19]);
- step2[17] = WRAPLOW(step1[17] + step1[18]);
- step2[18] = WRAPLOW(step1[17] - step1[18]);
- step2[19] = WRAPLOW(step1[16] - step1[19]);
- step2[20] = WRAPLOW(-step1[20] + step1[23]);
- step2[21] = WRAPLOW(-step1[21] + step1[22]);
- step2[22] = WRAPLOW(step1[21] + step1[22]);
- step2[23] = WRAPLOW(step1[20] + step1[23]);
-
- step2[24] = WRAPLOW(step1[24] + step1[27]);
- step2[25] = WRAPLOW(step1[25] + step1[26]);
- step2[26] = WRAPLOW(step1[25] - step1[26]);
- step2[27] = WRAPLOW(step1[24] - step1[27]);
- step2[28] = WRAPLOW(-step1[28] + step1[31]);
- step2[29] = WRAPLOW(-step1[29] + step1[30]);
- step2[30] = WRAPLOW(step1[29] + step1[30]);
- step2[31] = WRAPLOW(step1[28] + step1[31]);
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3]);
- step1[1] = WRAPLOW(step2[1] + step2[2]);
- step1[2] = WRAPLOW(step2[1] - step2[2]);
- step1[3] = WRAPLOW(step2[0] - step2[3]);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1));
- step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11]);
- step1[9] = WRAPLOW(step2[9] + step2[10]);
- step1[10] = WRAPLOW(step2[9] - step2[10]);
- step1[11] = WRAPLOW(step2[8] - step2[11]);
- step1[12] = WRAPLOW(-step2[12] + step2[15]);
- step1[13] = WRAPLOW(-step2[13] + step2[14]);
- step1[14] = WRAPLOW(step2[13] + step2[14]);
- step1[15] = WRAPLOW(step2[12] + step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1));
- step1[29] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1));
- step1[28] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7]);
- step2[1] = WRAPLOW(step1[1] + step1[6]);
- step2[2] = WRAPLOW(step1[2] + step1[5]);
- step2[3] = WRAPLOW(step1[3] + step1[4]);
- step2[4] = WRAPLOW(step1[3] - step1[4]);
- step2[5] = WRAPLOW(step1[2] - step1[5]);
- step2[6] = WRAPLOW(step1[1] - step1[6]);
- step2[7] = WRAPLOW(step1[0] - step1[7]);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1));
- step2[13] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1));
- step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = WRAPLOW(step1[16] + step1[23]);
- step2[17] = WRAPLOW(step1[17] + step1[22]);
- step2[18] = WRAPLOW(step1[18] + step1[21]);
- step2[19] = WRAPLOW(step1[19] + step1[20]);
- step2[20] = WRAPLOW(step1[19] - step1[20]);
- step2[21] = WRAPLOW(step1[18] - step1[21]);
- step2[22] = WRAPLOW(step1[17] - step1[22]);
- step2[23] = WRAPLOW(step1[16] - step1[23]);
-
- step2[24] = WRAPLOW(-step1[24] + step1[31]);
- step2[25] = WRAPLOW(-step1[25] + step1[30]);
- step2[26] = WRAPLOW(-step1[26] + step1[29]);
- step2[27] = WRAPLOW(-step1[27] + step1[28]);
- step2[28] = WRAPLOW(step1[27] + step1[28]);
- step2[29] = WRAPLOW(step1[26] + step1[29]);
- step2[30] = WRAPLOW(step1[25] + step1[30]);
- step2[31] = WRAPLOW(step1[24] + step1[31]);
-
- // stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15]);
- step1[1] = WRAPLOW(step2[1] + step2[14]);
- step1[2] = WRAPLOW(step2[2] + step2[13]);
- step1[3] = WRAPLOW(step2[3] + step2[12]);
- step1[4] = WRAPLOW(step2[4] + step2[11]);
- step1[5] = WRAPLOW(step2[5] + step2[10]);
- step1[6] = WRAPLOW(step2[6] + step2[9]);
- step1[7] = WRAPLOW(step2[7] + step2[8]);
- step1[8] = WRAPLOW(step2[7] - step2[8]);
- step1[9] = WRAPLOW(step2[6] - step2[9]);
- step1[10] = WRAPLOW(step2[5] - step2[10]);
- step1[11] = WRAPLOW(step2[4] - step2[11]);
- step1[12] = WRAPLOW(step2[3] - step2[12]);
- step1[13] = WRAPLOW(step2[2] - step2[13]);
- step1[14] = WRAPLOW(step2[1] - step2[14]);
- step1[15] = WRAPLOW(step2[0] - step2[15]);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1));
- step1[27] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1));
- step1[26] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1));
- step1[25] = WRAPLOW(dct_const_round_shift(temp2));
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1));
- step1[24] = WRAPLOW(dct_const_round_shift(temp2));
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = WRAPLOW(step1[0] + step1[31]);
- output[1] = WRAPLOW(step1[1] + step1[30]);
- output[2] = WRAPLOW(step1[2] + step1[29]);
- output[3] = WRAPLOW(step1[3] + step1[28]);
- output[4] = WRAPLOW(step1[4] + step1[27]);
- output[5] = WRAPLOW(step1[5] + step1[26]);
- output[6] = WRAPLOW(step1[6] + step1[25]);
- output[7] = WRAPLOW(step1[7] + step1[24]);
- output[8] = WRAPLOW(step1[8] + step1[23]);
- output[9] = WRAPLOW(step1[9] + step1[22]);
- output[10] = WRAPLOW(step1[10] + step1[21]);
- output[11] = WRAPLOW(step1[11] + step1[20]);
- output[12] = WRAPLOW(step1[12] + step1[19]);
- output[13] = WRAPLOW(step1[13] + step1[18]);
- output[14] = WRAPLOW(step1[14] + step1[17]);
- output[15] = WRAPLOW(step1[15] + step1[16]);
- output[16] = WRAPLOW(step1[15] - step1[16]);
- output[17] = WRAPLOW(step1[14] - step1[17]);
- output[18] = WRAPLOW(step1[13] - step1[18]);
- output[19] = WRAPLOW(step1[12] - step1[19]);
- output[20] = WRAPLOW(step1[11] - step1[20]);
- output[21] = WRAPLOW(step1[10] - step1[21]);
- output[22] = WRAPLOW(step1[9] - step1[22]);
- output[23] = WRAPLOW(step1[8] - step1[23]);
- output[24] = WRAPLOW(step1[7] - step1[24]);
- output[25] = WRAPLOW(step1[6] - step1[25]);
- output[26] = WRAPLOW(step1[5] - step1[26]);
- output[27] = WRAPLOW(step1[4] - step1[27]);
- output[28] = WRAPLOW(step1[3] - step1[28]);
- output[29] = WRAPLOW(step1[2] - step1[29]);
- output[30] = WRAPLOW(step1[1] - step1[30]);
- output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-void av1_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- av1_idct32_c(input, outptr);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // Only the upper-left 8x8 block has non-zero coefficients.
- for (i = 0; i < 8; ++i) {
- av1_idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
-
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = HIGHBD_WRAPLOW(b1, bd);
- op[2] = HIGHBD_WRAPLOW(c1, bd);
- op[3] = HIGHBD_WRAPLOW(d1, bd);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] =
- highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
- dest[stride * 1] =
- highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
- dest[stride * 2] =
- highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
- dest[stride * 3] =
- highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- (void)bd;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = HIGHBD_WRAPLOW(a1, bd);
- op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] =
- highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] =
- highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] =
- highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] =
- highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- (void)bd;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
- output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
- output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
- output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
-}
-
-void av1_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
-}
-
-void av1_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
- dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
- dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
- dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
- }
-}
-
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2 & stage 3 - even half
- av1_highbd_idct4_c(step1, step1, bd);
-
- // stage 2 - odd half
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- // stage 3 - odd half
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- // stage 4
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void av1_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
-}
-
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[7];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[5];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[3];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[1];
- tran_low_t x7 = input[6];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
-
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x4, bd);
- output[2] = HIGHBD_WRAPLOW(x6, bd);
- output[3] = HIGHBD_WRAPLOW(-x2, bd);
- output[4] = HIGHBD_WRAPLOW(x3, bd);
- output[5] = HIGHBD_WRAPLOW(-x7, bd);
- output[6] = HIGHBD_WRAPLOW(x5, bd);
- output[7] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- // Only the first 4 rows have non-zero coefficients.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void av1_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_low_t x0 = input[15];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[13];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[11];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[9];
- tran_low_t x7 = input[6];
- tran_low_t x8 = input[7];
- tran_low_t x9 = input[8];
- tran_low_t x10 = input[5];
- tran_low_t x11 = input[10];
- tran_low_t x12 = input[3];
- tran_low_t x13 = input[12];
- tran_low_t x14 = input[1];
- tran_low_t x15 = input[14];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
- x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
- x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
- x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
- x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
- x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
- x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
- x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
- x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
- x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
- x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x8, bd);
- output[2] = HIGHBD_WRAPLOW(x12, bd);
- output[3] = HIGHBD_WRAPLOW(-x4, bd);
- output[4] = HIGHBD_WRAPLOW(x6, bd);
- output[5] = HIGHBD_WRAPLOW(x14, bd);
- output[6] = HIGHBD_WRAPLOW(x10, bd);
- output[7] = HIGHBD_WRAPLOW(x2, bd);
- output[8] = HIGHBD_WRAPLOW(x3, bd);
- output[9] = HIGHBD_WRAPLOW(x11, bd);
- output[10] = HIGHBD_WRAPLOW(x15, bd);
- output[11] = HIGHBD_WRAPLOW(x7, bd);
- output[12] = HIGHBD_WRAPLOW(x5, bd);
- output[13] = HIGHBD_WRAPLOW(-x13, bd);
- output[14] = HIGHBD_WRAPLOW(x9, bd);
- output[15] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows. Since all non-zero dct coefficients are in the
- // upper-left 4x4 area, we only need to calculate the first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
- int bd) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
- step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
- step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
- step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
- step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
- step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
- step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
- step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
- step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
-
- // stage 5
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[7] = step2[7];
-
- step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
- step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
- step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
- step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
- step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
- step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
- step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
- step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
-
- step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
- step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
- step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
- step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
- step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
- step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
- step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
- step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
-
- // stage 7
- step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
- step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
- step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
- step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
- step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
- step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
- step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
- step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
- step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
- step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
- step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
- step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
- step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
- step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
- step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
- step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
- output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
- output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
- output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
- output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
- output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
- output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
- output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
- output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
- output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
- output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
- output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
- output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
- output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
- output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
- output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
- output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
- output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
- output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
- output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
- output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
- output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
- output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
- output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
- output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
- output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
- output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
- output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
- output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
- output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
- output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
- output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
-}
-
-void av1_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_low_t zero_coeff[16];
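- // OR the 32 coefficients of this row together in a pairwise tree so an
- // all-zero row can skip the full 32-point inverse transform below.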
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- // Only upper-left 8x8 has non-zero coeff.
- for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
- input += 32;
- outptr += 32;
- }
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- int a1;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
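- // DC-only path: scale the lone DC coefficient by cospi_16_64 twice (once
- // per pass), round, and add the resulting value to every output pixel.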
- tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_inv_txfm.h b/av1/common/av1_inv_txfm.h
deleted file mode 100644
index c57e888..0000000
--- a/av1/common/av1_inv_txfm.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid input streams, intermediate stage coefficients should always
- // stay within the range of a signed 16 bit integer. Coefficients can go out
- // of this range for invalid/corrupt streams. However, strictly checking
- this range for every intermediate coefficient can be burdensome for a decoder;
- therefore the following assertion is only enabled when configured with
- // --enable-coefficient-range-checking.
- assert(INT16_MIN <= input);
- assert(input <= INT16_MAX);
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- return input;
-}
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return rv;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid highbitdepth streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
- const int32_t int_max = (1 << (7 + bd)) - 1;
- const int32_t int_min = -int_max - 1;
- assert(int_min <= input);
- assert(input <= int_max);
- (void)int_min;
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- (void)bd;
- return input;
-}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return rv;
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However, to aid in hardware
-// verification, they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses tran_low with 16 bits, so 16 bits need to be removed
-// bd of 10 uses tran_low with 18 bits, so 14 bits need to be removed
-// bd of 12 uses tran_low with 20 bits, so 12 bits need to be removed
-// bd of x uses tran_low with 8+x bits, so 24-x bits need to be removed
-
-#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
- ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
-#endif // CONFIG_AOM_HIGHBITDEPTH
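-// For example, with bd == 8 HIGHBD_WRAPLOW reduces to ((int32_t)x << 16) >> 16,
-// wrapping the value into the signed 16-bit range of a 16-bit accumulator.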
-
-#else // CONFIG_EMULATE_HARDWARE
-
-#define WRAPLOW(x) ((int32_t)check_range(x))
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#endif // CONFIG_EMULATE_HARDWARE
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = HIGHBD_WRAPLOW(trans, bd);
- return clip_pixel_highbd(dest + (int)trans, bd);
-}
-#endif
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
- trans = WRAPLOW(trans);
- return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-} // extern "C"
-#endif
-#endif // AOM_DSP_INV_TXFM_H_
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e66826f..af98f79 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -61,25 +61,23 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/;
- if (aom_config("CONFIG_EXT_TX") eq "yes") {
- add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
- add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/;
- add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/;
- add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/;
- add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
- add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
- }
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/;
@@ -90,31 +88,29 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add sse2/;
- if (aom_config("CONFIG_EXT_TX") eq "yes") {
- add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add sse2/;
- add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add sse2/;
- add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add sse2/;
- add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add sse2/;
- add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add sse2/;
- add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add sse2/;
- }
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add sse2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add sse2/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -122,25 +118,23 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/;
- if (aom_config("CONFIG_EXT_TX") eq "yes") {
- add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add/;
- add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add/;
- add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add/;
- add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add/;
- add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add/;
- add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add/;
- }
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/;
@@ -151,31 +145,29 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
- if (aom_config("CONFIG_EXT_TX") eq "yes") {
- add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x8_32_add sse2/;
- add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x4_32_add sse2/;
- add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x16_128_add sse2/;
- add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x8_128_add sse2/;
- add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht16x32_512_add sse2/;
- add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht32x16_512_add sse2/;
- }
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add sse2 dspr2/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_iht4x4_16_add msa/;
@@ -283,25 +275,23 @@
add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht4x4_16_add/;
- if (aom_config("CONFIG_EXT_TX") eq "yes") {
- add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht4x8_32_add/;
- add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x4_32_add/;
- add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x16_128_add/;
- add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht16x8_128_add/;
- add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht16x32_512_add/;
- add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht32x16_512_add/;
- }
add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x8_64_add/;
@@ -394,6 +384,11 @@
add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht32x32 avx2/;
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht64x64/;
+}
+
if (aom_config("CONFIG_EXT_TX") eq "yes") {
add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_fht4x8 sse2/;
@@ -414,62 +409,6 @@
specialize qw/av1_fht32x16 sse2/;
}
-if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1/;
-} else {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4 sse2/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1 sse2/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8 sse2/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1 sse2/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16 sse2/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1 sse2/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32 sse2/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1 sse2/;
-}
-
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
if (aom_config("CONFIG_EXT_TX") ne "yes") {
specialize qw/av1_fht4x4 msa/;
@@ -478,243 +417,9 @@
}
}
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- } else {
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16 sse2/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32 sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- }
-}
-
add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
specialize qw/av1_fwd_idtx/;
-# Inverse transform
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- # Note as optimized versions of these functions are added we need to add a check to ensure
- # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_1_add/;
-
- add_proto qw/void av1_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_1_add/;
-
- add_proto qw/void av1_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_1_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1024_add/;
-
- add_proto qw/void av1_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_34_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_16_add/;
-
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add/;
- } else {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add sse2/;
- } # CONFIG_EMULATE_HARDWARE
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } else {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add sse2/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add sse2/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add sse2/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add sse2/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add sse2/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add sse2/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add sse2/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add sse2/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } # CONFIG_EMULATE_HARDWARE
-} # CONFIG_AOM_HIGHBITDEPTH
-
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
#fwd txfm
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
@@ -826,6 +531,11 @@
add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_highbd_fht32x32/;
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht64x64/;
+ }
+
add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fwht4x4/;
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 8d7c7f8..4d8f5e2 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -443,18 +443,6 @@
}
#endif // CONFIG_SUPERTX
-static INLINE int get_tx1d_width(TX_SIZE tx_size) {
- return num_4x4_blocks_wide_txsize_lookup[tx_size] << 2;
-}
-
-static INLINE int get_tx1d_height(TX_SIZE tx_size) {
- return num_4x4_blocks_high_txsize_lookup[tx_size] << 2;
-}
-
-static INLINE int get_tx2d_size(TX_SIZE tx_size) {
- return num_4x4_blocks_txsize_lookup[tx_size] << 4;
-}
-
#if CONFIG_EXT_TX
#define ALLOW_INTRA_EXT_TX 1
@@ -712,6 +700,14 @@
void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
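+
+// Map a square transform size to its depth relative to TX_4X4, and back.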
+static INLINE int tx_size_to_depth(const TX_SIZE tx_size) {
+ return (int)(tx_size - TX_4X4);
+}
+
+static INLINE TX_SIZE depth_to_tx_size(const int depth) {
+ return (TX_SIZE)(depth + TX_4X4);
+}
+
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
const struct macroblockd_plane *pd) {
TX_SIZE uv_txsize;
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 9d851e2..4f799db 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -67,44 +67,6 @@
1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)
};
-static const uint8_t num_4x4_blocks_txsize_lookup[TX_SIZES_ALL] = {
- 1, 4, 16, 64,
-#if CONFIG_EXT_TX
- 2, 2, 8, 8, 32, 32
-#endif // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_wide_txsize_lookup[TX_SIZES_ALL] = {
- 1, 2, 4, 8,
-#if CONFIG_EXT_TX
- 1, 2, 2, 4, 4, 8
-#endif // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_high_txsize_lookup[TX_SIZES_ALL] = {
- 1, 2, 4, 8,
-#if CONFIG_EXT_TX
- 2, 1, 4, 2, 8, 4
-#endif // CONFIG_EXT_TX
-};
-
-static const uint8_t num_4x4_blocks_txsize_log2_lookup[TX_SIZES_ALL] = {
- 0, 2, 4, 6,
-#if CONFIG_EXT_TX
- 1, 1, 3, 3, 5, 5
-#endif // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_wide_txsize_log2_lookup[TX_SIZES_ALL] = {
- 0, 1, 2, 3,
-#if CONFIG_EXT_TX
- 0, 1, 1, 2, 2, 3
-#endif // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_high_txsize_log2_lookup[TX_SIZES_ALL] = {
- 0, 1, 2, 3,
-#if CONFIG_EXT_TX
- 1, 0, 2, 1, 3, 2
-#endif // CONFIG_EXT_TX
-};
-
// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
static const uint8_t size_group_lookup[BLOCK_SIZES] = {
0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)
@@ -419,6 +381,9 @@
/* clang-format on */
static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
TX_4X4, // TX_4X4
TX_8X8, // TX_8X8
TX_16X16, // TX_16X16
@@ -434,6 +399,9 @@
};
static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
TX_4X4, // TX_4X4
TX_8X8, // TX_8X8
TX_16X16, // TX_16X16
@@ -450,6 +418,9 @@
// Transform block width in pixels
static const int tx_size_wide[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
4, 8, 16, 32,
#if CONFIG_EXT_TX
4, 8, 8, 16, 16, 32,
@@ -458,6 +429,9 @@
// Transform block height in pixels
static const int tx_size_high[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
4, 8, 16, 32,
#if CONFIG_EXT_TX
8, 4, 16, 8, 32, 16,
@@ -466,6 +440,9 @@
// Transform block width in unit
static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
1, 2, 4, 8,
#if CONFIG_EXT_TX
1, 2, 2, 4, 4, 8,
@@ -474,6 +451,9 @@
// Transform block height in unit
static const int tx_size_high_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
1, 2, 4, 8,
#if CONFIG_EXT_TX
2, 1, 4, 2, 8, 4,
@@ -482,6 +462,9 @@
// Transform block width in log2
static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
2, 3, 4, 5,
#if CONFIG_EXT_TX
2, 3, 3, 4, 4, 5,
@@ -490,6 +473,9 @@
// Transform block height in log2
static const int tx_size_high_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
2, 3, 4, 5,
#if CONFIG_EXT_TX
3, 2, 4, 3, 5, 4,
@@ -497,6 +483,9 @@
};
static const int tx_size_2d[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 4,
+#endif
16, 64, 256, 1024,
#if CONFIG_EXT_TX
32, 32, 128, 128, 512, 512,
@@ -509,6 +498,9 @@
static const int tx_size_1d_in_unit_log2[TX_SIZES] = { 0, 1, 2, 3 };
static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ BLOCK_4X4, // TX_2X2
+#endif
BLOCK_4X4, // TX_4X4
BLOCK_8X8, // TX_8X8
BLOCK_16X16, // TX_16X16
@@ -524,6 +516,9 @@
};
static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
TX_4X4, // TX_4X4
TX_8X8, // TX_8X8
TX_16X16, // TX_16X16
@@ -539,6 +534,9 @@
};
static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
TX_4X4, // TX_4X4
TX_8X8, // TX_8X8
TX_16X16, // TX_16X16
@@ -589,7 +587,10 @@
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
{
- // BLOCK_4X4
+// BLOCK_4X4
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -604,7 +605,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_4X8
+// BLOCK_4X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -623,7 +627,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_8X4
+// BLOCK_8X4
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -642,7 +649,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_8X8
+// BLOCK_8X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
@@ -657,7 +667,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_8X16
+// BLOCK_8X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
@@ -676,7 +689,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_16X8
+// BLOCK_16X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
{ { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } },
@@ -695,7 +711,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_16X16
+// BLOCK_16X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
@@ -710,7 +729,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_16X32
+// BLOCK_16X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
@@ -729,7 +751,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_32X16
+// BLOCK_32X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
@@ -748,7 +773,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_32X32
+// BLOCK_32X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -763,7 +791,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_32X64
+// BLOCK_32X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -778,7 +809,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_64X32
+// BLOCK_64X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -793,7 +827,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_64X64
+// BLOCK_64X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -801,7 +838,10 @@
#if CONFIG_EXT_PARTITION
},
{
- // BLOCK_64X128
+// BLOCK_64X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -816,7 +856,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_128X64
+// BLOCK_128X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -831,7 +874,10 @@
#endif // CONFIG_EXT_TX
},
{
- // BLOCK_128X128
+// BLOCK_128X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
{ { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -891,8 +937,11 @@
#if CONFIG_SUPERTX
static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
- // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
- // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
{ { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
{ { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
diff --git a/av1/common/av1_convolve.c b/av1/common/convolve.c
similarity index 95%
rename from av1/common/av1_convolve.c
rename to av1/common/convolve.c
index 1f8d623..eef629e 100644
--- a/av1/common/av1_convolve.c
+++ b/av1/common/convolve.c
@@ -1,8 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#include <assert.h>
#include <string.h>
#include "./av1_rtcd.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
diff --git a/av1/common/av1_convolve.h b/av1/common/convolve.h
similarity index 67%
rename from av1/common/av1_convolve.h
rename to av1/common/convolve.h
index 804c102..dafa032 100644
--- a/av1/common/av1_convolve.h
+++ b/av1/common/convolve.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
#ifndef AV1_COMMON_AV1_CONVOLVE_H_
#define AV1_COMMON_AV1_CONVOLVE_H_
#include "av1/common/filter.h"
diff --git a/av1/common/dering.c b/av1/common/dering.c
index c21d4e5..908c588 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+// clang-format off
+
#include <string.h>
#include <math.h>
@@ -45,22 +47,94 @@
return skip;
}
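+/* Like sb_all_skip(), but also records the (row, col) offset of each
+   non-skip mi unit within the superblock in bskip, and returns the number
+   of recorded entries in *count_ptr. */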
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ unsigned char (*bskip)[2], int *count_ptr) {
+ int r, c;
+ int maxc, maxr;
+ int skip = 1;
+ MODE_INFO **grid;
+ int count=0;
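+ /* Number of non-skip mi units found in this superblock. */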
+ grid = cm->mi_grid_visible;
+ maxc = cm->mi_cols - mi_col;
+ maxr = cm->mi_rows - mi_row;
+ if (maxr > MAX_MIB_SIZE) maxr = MAX_MIB_SIZE;
+ if (maxc > MAX_MIB_SIZE) maxc = MAX_MIB_SIZE;
+ for (r = 0; r < maxr; r++) {
+ MODE_INFO **grid_row;
+ grid_row = &grid[(mi_row + r) * cm->mi_stride + mi_col];
+ for (c = 0; c < maxc; c++) {
+ if (!grid_row[c]->mbmi.skip) {
+ skip = 0;
+ bskip[count][0] = r;
+ bskip[count][1] = c;
+ count++;
+ }
+ }
+ }
+ *count_ptr = count;
+ return skip;
+}
+
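+/* Copy a block of 16-bit intermediate dering output into the 8-bit
+   destination buffer. */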
+static INLINE void copy_8x8_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride) {
+ int i, j;
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ dst[i * dstride + j] = src[i * sstride + j];
+}
+
+static INLINE void copy_4x4_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride) {
+ int i, j;
+ for (i = 0; i < 4; i++)
+ for (j = 0; j < 4; j++)
+ dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* TODO: Optimize this function for SSE. */
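+/* Copy the filtered blocks listed in bskip (dering_count entries) back into
+   the 8-bit destination: 8x8 blocks when bsize == 3, 4x4 blocks otherwise. */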
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+ unsigned char (*bskip)[2], int dering_count, int bsize)
+{
+ int bi, bx, by;
+ if (bsize == 3) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
+ dstride,
+ &src[(by << 3) * sstride + (bx << 3)], sstride);
+ }
+ } else {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
+ dstride,
+ &src[(by << 2) * sstride + (bx << 2)], sstride);
+ }
+ }
+}
+
void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int global_level) {
int r, c;
int sbr, sbc;
int nhsb, nvsb;
od_dering_in *src[3];
- unsigned char *bskip;
+ unsigned char bskip[MAX_MIB_SIZE*MAX_MIB_SIZE][2];
+ int dering_count;
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
int stride;
int bsize[3];
int dec[3];
int pli;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+ int nplanes;
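+ /* Dering chroma only when each chroma plane has equal horizontal and
+    vertical subsampling (e.g. 4:2:0 or 4:4:4); otherwise filter luma only. */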
+ if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+ xd->plane[2].subsampling_x == xd->plane[2].subsampling_y)
+ nplanes = 3;
+ else
+ nplanes = 1;
nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
- bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
av1_setup_dst_planes(xd->plane, frame, 0, 0);
for (pli = 0; pli < 3; pli++) {
dec[pli] = xd->plane[pli].subsampling_x;
@@ -85,13 +159,6 @@
}
}
}
- for (r = 0; r < cm->mi_rows; ++r) {
- for (c = 0; c < cm->mi_cols; ++c) {
- const MB_MODE_INFO *mbmi =
- &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
- bskip[r * cm->mi_cols + c] = mbmi->skip;
- }
- }
for (sbr = 0; sbr < nvsb; sbr++) {
for (sbc = 0; sbc < nhsb; sbc++) {
int level;
@@ -102,9 +169,10 @@
global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc]
->mbmi.dering_gain);
- if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
+ if (level == 0 ||
+ sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
continue;
- for (pli = 0; pli < 3; pli++) {
+ for (pli = 0; pli < nplanes; pli++) {
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
int threshold;
/* FIXME: This is a temporary hack that uses more conservative
@@ -118,33 +186,31 @@
&src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
sbc * bsize[pli] * MAX_MIB_SIZE],
stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
- &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
- cm->mi_cols, threshold, coeff_shift);
- for (r = 0; r < bsize[pli] * nvb; ++r) {
- for (c = 0; c < bsize[pli] * nhb; ++c) {
+ bskip, dering_count, threshold, coeff_shift);
#if CONFIG_AOM_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
- [xd->plane[pli].dst.stride *
- (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
- sbc * bsize[pli] * MAX_MIB_SIZE + c] =
- dst[r * MAX_MIB_SIZE * bsize[pli] + c];
- } else {
+ if (cm->use_highbitdepth) {
+ copy_blocks_16bit(
+ (int16_t*)&CONVERT_TO_SHORTPTR(
+ xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
+ (bsize[pli] * MAX_MIB_SIZE * sbr) +
+ sbc * bsize[pli] * MAX_MIB_SIZE],
+ xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+ dering_count, 3 - dec[pli]);
+ } else {
#endif
- xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
- (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
- sbc * bsize[pli] * MAX_MIB_SIZE + c] =
- dst[r * MAX_MIB_SIZE * bsize[pli] + c];
+ copy_blocks_16_8bit(
+ &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
+ (bsize[pli] * MAX_MIB_SIZE * sbr) +
+ sbc * bsize[pli] * MAX_MIB_SIZE],
+ xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+ dering_count, 3 - dec[pli]);
#if CONFIG_AOM_HIGHBITDEPTH
- }
-#endif
- }
}
+#endif
}
}
}
- for (pli = 0; pli < 3; pli++) {
+ for (pli = 0; pli < nplanes; pli++) {
aom_free(src[pli]);
}
- aom_free(bskip);
}
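
To make the list-driven copying above concrete: each bskip entry filled by
sb_all_skip_out holds the block row in [0] and the block column in [1], and
the pixel offset of block (by, bx) is (by << log2_bsize) * stride +
(bx << log2_bsize), where log2_bsize is 3 for 8x8 blocks and 2 for 4x4 blocks
(the "3 - dec[pli]" argument). A minimal sketch of a consumer of such a list,
for illustration only and not part of this patch:

  /* Mirrors the addressing used by copy_blocks_16_8bit above. */
  static void visit_dering_blocks_sketch(unsigned char (*bskip)[2], int count,
                                         int log2_bsize, int stride) {
    int bi;
    for (bi = 0; bi < count; bi++) {
      const int by = bskip[bi][0];
      const int bx = bskip[bi][1];
      const int offset = (by << log2_bsize) * stride + (bx << log2_bsize);
      (void)offset; /* a real caller would copy or filter the block here */
    }
  }
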
diff --git a/av1/common/dering.h b/av1/common/dering.h
index 7c93f8b..c906994 100644
--- a/av1/common/dering.h
+++ b/av1/common/dering.h
@@ -11,6 +11,8 @@
#ifndef AV1_COMMON_DERING_H_
#define AV1_COMMON_DERING_H_
+// clang-format off
+
#include "av1/common/od_dering.h"
#include "av1/common/onyxc_int.h"
#include "aom/aom_integer.h"
@@ -29,6 +31,8 @@
int compute_level_from_index(int global_level, int gi);
int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ unsigned char (*bskip)[2], int *count_ptr);
void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
MACROBLOCKD *xd, int global_level);
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index 870632d..7e66e93 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -9,11 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "./aom_config.h"
#include "av1/common/entropy.h"
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
-#include "av1/common/scan.h"
#include "av1/common/entropymode.h"
+#include "av1/common/scan.h"
#include "aom_mem/aom_mem.h"
#include "aom/aom_integer.h"
@@ -58,6 +59,9 @@
#endif
const uint16_t band_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 1, 2, 2, 3, 0, 0, 0 },
+#endif
{ 1, 2, 3, 4, 3, 16 - 13, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 },
{ 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 },
#if CONFIG_EXT_TX
@@ -68,6 +72,9 @@
};
const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 0, 1, 3, 6, 10, 13, 16, 0 },
+#endif
{ 0, 1, 3, 6, 10, 13, 16, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 },
{ 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 },
#if CONFIG_EXT_TX
@@ -407,7 +414,7 @@
{ 255, 246, 247, 255, 239, 255, 253, 255 },
};
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
// Model obtained from a 2-sided zero-centered distribution derived
// from a Pareto distribution. The cdf of the distribution is:
// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
@@ -679,7 +686,7 @@
{ 32512, 238, 11, 1, 1, 1, 1, 1, 1, 1 },
{ 32640, 117, 4, 1, 1, 1, 1, 1, 1, 1 },
};
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
/* clang-format off */
#if CONFIG_ENTROPY
@@ -2833,6 +2840,9 @@
ROUND_POWER_OF_TWO(cm->base_qindex, 8 - QCTX_BIN_BITS), QCTX_BINS - 1);
av1_copy(cm->fc->coef_probs, default_qctx_coef_probs[index]);
#else
+#if CONFIG_CB4X4
+ av1_copy(cm->fc->coef_probs[TX_2X2], default_coef_probs_4x4);
+#endif
av1_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
av1_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
av1_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
@@ -2913,11 +2923,8 @@
count_sat = COEF_COUNT_SAT;
}
#endif // CONFIG_ENTROPY
- for (tx_size = TX_4X4; tx_size <= TX_32X32; tx_size++)
+ for (tx_size = 0; tx_size < TX_SIZES; tx_size++)
adapt_coef_probs(cm, tx_size, count_sat, update_factor);
-#if CONFIG_RANS
- av1_coef_pareto_cdfs(cm->fc);
-#endif // CONFIG_RANS
#if CONFIG_ADAPT_SCAN
for (tx_size = TX_4X4; tx_size < TX_SIZES; ++tx_size)
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index 469b484..423b35c 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -14,9 +14,6 @@
#include "./aom_config.h"
#include "aom/aom_integer.h"
-#if CONFIG_RANS
-#include "aom_dsp/ans.h"
-#endif // CONFIG_RANS
#include "aom_dsp/prob.h"
#include "av1/common/common.h"
@@ -200,14 +197,14 @@
void av1_model_to_full_probs(const aom_prob *model, aom_prob *full);
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
[ENTROPY_TOKENS];
extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
[ENTROPY_TOKENS - 2];
struct frame_contexts;
void av1_coef_pareto_cdfs(struct frame_contexts *fc);
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
typedef char ENTROPY_CONTEXT;
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index ecad3f4..910ba0f 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -561,280 +561,348 @@
-PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT },
};
-const aom_prob
- av1_default_palette_y_color_prob[PALETTE_MAX_SIZE - 1]
- [PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = {
- {
- // 2 colors
- { 230, 255, 128, 128, 128, 128, 128 },
- { 214, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 240, 255, 128, 128, 128, 128, 128 },
- { 73, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 130, 255, 128, 128, 128, 128, 128 },
- { 227, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 188, 255, 128, 128, 128, 128, 128 },
- { 75, 255, 128, 128, 128, 128, 128 },
- { 250, 255, 128, 128, 128, 128, 128 },
- { 223, 255, 128, 128, 128, 128, 128 },
- { 252, 255, 128, 128, 128, 128, 128 },
- },
- {
- // 3 colors
- { 229, 137, 255, 128, 128, 128, 128 },
- { 197, 120, 255, 128, 128, 128, 128 },
- { 107, 195, 255, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 27, 151, 255, 128, 128, 128, 128 },
- { 230, 130, 255, 128, 128, 128, 128 },
- { 37, 230, 255, 128, 128, 128, 128 },
- { 67, 221, 255, 128, 128, 128, 128 },
- { 124, 230, 255, 128, 128, 128, 128 },
- { 195, 109, 255, 128, 128, 128, 128 },
- { 99, 122, 255, 128, 128, 128, 128 },
- { 205, 208, 255, 128, 128, 128, 128 },
- { 40, 235, 255, 128, 128, 128, 128 },
- { 251, 132, 255, 128, 128, 128, 128 },
- { 237, 186, 255, 128, 128, 128, 128 },
- { 253, 112, 255, 128, 128, 128, 128 },
- },
- {
- // 4 colors
- { 195, 87, 128, 255, 128, 128, 128 },
- { 143, 100, 123, 255, 128, 128, 128 },
- { 94, 124, 119, 255, 128, 128, 128 },
- { 77, 91, 130, 255, 128, 128, 128 },
- { 39, 114, 178, 255, 128, 128, 128 },
- { 222, 94, 125, 255, 128, 128, 128 },
- { 44, 203, 132, 255, 128, 128, 128 },
- { 68, 175, 122, 255, 128, 128, 128 },
- { 110, 187, 124, 255, 128, 128, 128 },
- { 152, 91, 128, 255, 128, 128, 128 },
- { 70, 109, 181, 255, 128, 128, 128 },
- { 133, 113, 164, 255, 128, 128, 128 },
- { 47, 205, 133, 255, 128, 128, 128 },
- { 247, 94, 136, 255, 128, 128, 128 },
- { 205, 122, 146, 255, 128, 128, 128 },
- { 251, 100, 141, 255, 128, 128, 128 },
- },
- {
- // 5 colors
- { 195, 65, 84, 125, 255, 128, 128 },
- { 150, 76, 84, 121, 255, 128, 128 },
- { 94, 110, 81, 117, 255, 128, 128 },
- { 79, 85, 91, 139, 255, 128, 128 },
- { 26, 102, 139, 127, 255, 128, 128 },
- { 220, 73, 91, 119, 255, 128, 128 },
- { 38, 203, 86, 127, 255, 128, 128 },
- { 61, 186, 72, 124, 255, 128, 128 },
- { 132, 199, 84, 128, 255, 128, 128 },
- { 172, 52, 62, 120, 255, 128, 128 },
- { 102, 89, 121, 122, 255, 128, 128 },
- { 182, 48, 69, 186, 255, 128, 128 },
- { 36, 206, 87, 126, 255, 128, 128 },
- { 249, 55, 67, 122, 255, 128, 128 },
- { 218, 88, 75, 122, 255, 128, 128 },
- { 253, 64, 80, 119, 255, 128, 128 },
- },
- {
- // 6 colors
- { 182, 54, 64, 75, 118, 255, 128 },
- { 126, 67, 70, 76, 116, 255, 128 },
- { 79, 92, 67, 85, 120, 255, 128 },
- { 63, 61, 81, 118, 132, 255, 128 },
- { 21, 80, 105, 83, 119, 255, 128 },
- { 215, 72, 74, 74, 111, 255, 128 },
- { 50, 176, 63, 79, 120, 255, 128 },
- { 72, 148, 66, 77, 120, 255, 128 },
- { 105, 177, 57, 78, 130, 255, 128 },
- { 150, 66, 66, 80, 127, 255, 128 },
- { 81, 76, 109, 85, 116, 255, 128 },
- { 113, 81, 62, 96, 148, 255, 128 },
- { 54, 179, 69, 82, 121, 255, 128 },
- { 244, 47, 48, 67, 118, 255, 128 },
- { 198, 83, 53, 65, 121, 255, 128 },
- { 250, 42, 51, 69, 110, 255, 128 },
- },
- {
- // 7 colors
- { 182, 45, 54, 62, 74, 113, 255 },
- { 124, 63, 57, 62, 77, 114, 255 },
- { 77, 80, 56, 66, 76, 117, 255 },
- { 63, 57, 69, 98, 85, 131, 255 },
- { 19, 81, 98, 63, 80, 116, 255 },
- { 215, 56, 60, 63, 68, 105, 255 },
- { 50, 174, 50, 60, 79, 118, 255 },
- { 68, 151, 50, 58, 73, 117, 255 },
- { 104, 182, 53, 57, 79, 127, 255 },
- { 156, 50, 51, 63, 77, 111, 255 },
- { 88, 67, 97, 59, 82, 120, 255 },
- { 114, 81, 46, 65, 103, 132, 255 },
- { 55, 166, 57, 66, 82, 120, 255 },
- { 245, 34, 38, 43, 63, 114, 255 },
- { 203, 68, 45, 47, 60, 118, 255 },
- { 250, 35, 37, 47, 66, 110, 255 },
- },
- {
- // 8 colors
- { 180, 43, 46, 50, 56, 69, 109 },
- { 116, 53, 51, 49, 57, 73, 115 },
- { 79, 70, 49, 50, 59, 74, 117 },
- { 60, 54, 57, 70, 62, 83, 129 },
- { 20, 73, 85, 52, 66, 81, 119 },
- { 213, 56, 52, 49, 53, 62, 104 },
- { 48, 161, 41, 45, 56, 77, 116 },
- { 68, 139, 40, 47, 54, 71, 116 },
- { 123, 166, 42, 43, 52, 76, 130 },
- { 153, 44, 44, 47, 54, 79, 129 },
- { 87, 64, 83, 49, 60, 75, 127 },
- { 131, 68, 43, 48, 73, 96, 130 },
- { 55, 152, 45, 51, 64, 77, 113 },
- { 243, 30, 28, 33, 41, 65, 114 },
- { 202, 56, 35, 36, 42, 63, 123 },
- { 249, 31, 29, 32, 45, 68, 111 },
- }
- };
+// Note: Has to be non-zero to avoid triggering any asserts.
+#define UNUSED_PROB 128
+
+const aom_prob av1_default_palette_y_color_prob
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 214, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 240, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 73, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 227, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 188, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 75, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 250, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 223, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 252, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 229, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 197, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 107, 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 27, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 230, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 37, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 67, 221, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 124, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 195, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 99, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 205, 208, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 40, 235, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 251, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 237, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 253, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 195, 87, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 143, 100, 123, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 124, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 77, 91, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 39, 114, 178, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 222, 94, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 44, 203, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 68, 175, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 110, 187, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 152, 91, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 70, 109, 181, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 133, 113, 164, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 205, 133, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 247, 94, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 205, 122, 146, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 251, 100, 141, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 195, 65, 84, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 150, 76, 84, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 110, 81, 117, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 79, 85, 91, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 26, 102, 139, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 220, 73, 91, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 38, 203, 86, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 61, 186, 72, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 132, 199, 84, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 172, 52, 62, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 102, 89, 121, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 182, 48, 69, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 36, 206, 87, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 249, 55, 67, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 218, 88, 75, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 253, 64, 80, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 182, 54, 64, 75, 118, UNUSED_PROB, UNUSED_PROB },
+ { 126, 67, 70, 76, 116, UNUSED_PROB, UNUSED_PROB },
+ { 79, 92, 67, 85, 120, UNUSED_PROB, UNUSED_PROB },
+ { 63, 61, 81, 118, 132, UNUSED_PROB, UNUSED_PROB },
+ { 21, 80, 105, 83, 119, UNUSED_PROB, UNUSED_PROB },
+ { 215, 72, 74, 74, 111, UNUSED_PROB, UNUSED_PROB },
+ { 50, 176, 63, 79, 120, UNUSED_PROB, UNUSED_PROB },
+ { 72, 148, 66, 77, 120, UNUSED_PROB, UNUSED_PROB },
+ { 105, 177, 57, 78, 130, UNUSED_PROB, UNUSED_PROB },
+ { 150, 66, 66, 80, 127, UNUSED_PROB, UNUSED_PROB },
+ { 81, 76, 109, 85, 116, UNUSED_PROB, UNUSED_PROB },
+ { 113, 81, 62, 96, 148, UNUSED_PROB, UNUSED_PROB },
+ { 54, 179, 69, 82, 121, UNUSED_PROB, UNUSED_PROB },
+ { 244, 47, 48, 67, 118, UNUSED_PROB, UNUSED_PROB },
+ { 198, 83, 53, 65, 121, UNUSED_PROB, UNUSED_PROB },
+ { 250, 42, 51, 69, 110, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 182, 45, 54, 62, 74, 113, UNUSED_PROB },
+ { 124, 63, 57, 62, 77, 114, UNUSED_PROB },
+ { 77, 80, 56, 66, 76, 117, UNUSED_PROB },
+ { 63, 57, 69, 98, 85, 131, UNUSED_PROB },
+ { 19, 81, 98, 63, 80, 116, UNUSED_PROB },
+ { 215, 56, 60, 63, 68, 105, UNUSED_PROB },
+ { 50, 174, 50, 60, 79, 118, UNUSED_PROB },
+ { 68, 151, 50, 58, 73, 117, UNUSED_PROB },
+ { 104, 182, 53, 57, 79, 127, UNUSED_PROB },
+ { 156, 50, 51, 63, 77, 111, UNUSED_PROB },
+ { 88, 67, 97, 59, 82, 120, UNUSED_PROB },
+ { 114, 81, 46, 65, 103, 132, UNUSED_PROB },
+ { 55, 166, 57, 66, 82, 120, UNUSED_PROB },
+ { 245, 34, 38, 43, 63, 114, UNUSED_PROB },
+ { 203, 68, 45, 47, 60, 118, UNUSED_PROB },
+ { 250, 35, 37, 47, 66, 110, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 180, 43, 46, 50, 56, 69, 109 },
+ { 116, 53, 51, 49, 57, 73, 115 },
+ { 79, 70, 49, 50, 59, 74, 117 },
+ { 60, 54, 57, 70, 62, 83, 129 },
+ { 20, 73, 85, 52, 66, 81, 119 },
+ { 213, 56, 52, 49, 53, 62, 104 },
+ { 48, 161, 41, 45, 56, 77, 116 },
+ { 68, 139, 40, 47, 54, 71, 116 },
+ { 123, 166, 42, 43, 52, 76, 130 },
+ { 153, 44, 44, 47, 54, 79, 129 },
+ { 87, 64, 83, 49, 60, 75, 127 },
+ { 131, 68, 43, 48, 73, 96, 130 },
+ { 55, 152, 45, 51, 64, 77, 113 },
+ { 243, 30, 28, 33, 41, 65, 114 },
+ { 202, 56, 35, 36, 42, 63, 123 },
+ { 249, 31, 29, 32, 45, 68, 111 },
+ }
+ };
const aom_prob av1_default_palette_uv_color_prob
- [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = { {
- // 2 colors
- { 228, 255, 128, 128, 128, 128, 128 },
- { 195, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 228, 255, 128, 128, 128, 128, 128 },
- { 71, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 129, 255, 128, 128, 128, 128, 128 },
- { 206, 255, 128, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 136, 255, 128, 128, 128, 128, 128 },
- { 98, 255, 128, 128, 128, 128, 128 },
- { 236, 255, 128, 128, 128, 128, 128 },
- { 222, 255, 128, 128, 128, 128, 128 },
- { 249, 255, 128, 128, 128, 128, 128 },
- },
- {
- // 3 colors
- { 198, 136, 255, 128, 128, 128, 128 },
- { 178, 105, 255, 128, 128, 128, 128 },
- { 100, 206, 255, 128, 128, 128, 128 },
- { 128, 128, 128, 128, 128, 128, 128 },
- { 12, 136, 255, 128, 128, 128, 128 },
- { 219, 134, 255, 128, 128, 128, 128 },
- { 50, 198, 255, 128, 128, 128, 128 },
- { 61, 231, 255, 128, 128, 128, 128 },
- { 110, 209, 255, 128, 128, 128, 128 },
- { 173, 106, 255, 128, 128, 128, 128 },
- { 145, 166, 255, 128, 128, 128, 128 },
- { 156, 175, 255, 128, 128, 128, 128 },
- { 69, 183, 255, 128, 128, 128, 128 },
- { 241, 163, 255, 128, 128, 128, 128 },
- { 224, 160, 255, 128, 128, 128, 128 },
- { 246, 154, 255, 128, 128, 128, 128 },
- },
- {
- // 4 colors
- { 173, 88, 143, 255, 128, 128, 128 },
- { 146, 81, 127, 255, 128, 128, 128 },
- { 84, 134, 102, 255, 128, 128, 128 },
- { 69, 138, 140, 255, 128, 128, 128 },
- { 31, 103, 200, 255, 128, 128, 128 },
- { 217, 101, 139, 255, 128, 128, 128 },
- { 51, 174, 121, 255, 128, 128, 128 },
- { 64, 177, 109, 255, 128, 128, 128 },
- { 96, 179, 145, 255, 128, 128, 128 },
- { 164, 77, 114, 255, 128, 128, 128 },
- { 87, 94, 156, 255, 128, 128, 128 },
- { 105, 57, 173, 255, 128, 128, 128 },
- { 63, 158, 137, 255, 128, 128, 128 },
- { 236, 102, 156, 255, 128, 128, 128 },
- { 197, 115, 153, 255, 128, 128, 128 },
- { 245, 106, 154, 255, 128, 128, 128 },
- },
- {
- // 5 colors
- { 179, 64, 97, 129, 255, 128, 128 },
- { 137, 56, 88, 125, 255, 128, 128 },
- { 82, 107, 61, 118, 255, 128, 128 },
- { 59, 113, 86, 115, 255, 128, 128 },
- { 23, 88, 118, 130, 255, 128, 128 },
- { 213, 66, 90, 125, 255, 128, 128 },
- { 37, 181, 103, 121, 255, 128, 128 },
- { 47, 188, 61, 131, 255, 128, 128 },
- { 104, 185, 103, 144, 255, 128, 128 },
- { 163, 39, 76, 112, 255, 128, 128 },
- { 94, 74, 131, 126, 255, 128, 128 },
- { 142, 42, 103, 163, 255, 128, 128 },
- { 53, 162, 99, 149, 255, 128, 128 },
- { 239, 54, 84, 108, 255, 128, 128 },
- { 203, 84, 110, 147, 255, 128, 128 },
- { 248, 70, 105, 151, 255, 128, 128 },
- },
- {
- // 6 colors
- { 189, 50, 67, 90, 130, 255, 128 },
- { 114, 50, 55, 90, 123, 255, 128 },
- { 66, 76, 54, 82, 128, 255, 128 },
- { 43, 69, 69, 80, 129, 255, 128 },
- { 22, 59, 87, 88, 141, 255, 128 },
- { 203, 49, 68, 87, 122, 255, 128 },
- { 43, 157, 74, 104, 146, 255, 128 },
- { 54, 138, 51, 95, 138, 255, 128 },
- { 82, 171, 58, 102, 146, 255, 128 },
- { 129, 38, 59, 64, 168, 255, 128 },
- { 56, 67, 119, 92, 112, 255, 128 },
- { 96, 62, 53, 132, 82, 255, 128 },
- { 60, 147, 77, 108, 145, 255, 128 },
- { 238, 76, 73, 93, 148, 255, 128 },
- { 189, 86, 73, 103, 157, 255, 128 },
- { 246, 62, 75, 83, 167, 255, 128 },
- },
- {
- // 7 colors
- { 179, 42, 51, 73, 99, 134, 255 },
- { 119, 52, 52, 61, 64, 114, 255 },
- { 53, 77, 35, 65, 71, 131, 255 },
- { 38, 70, 51, 68, 89, 144, 255 },
- { 23, 65, 128, 73, 97, 131, 255 },
- { 210, 47, 52, 63, 81, 143, 255 },
- { 42, 159, 57, 68, 98, 143, 255 },
- { 49, 153, 45, 82, 93, 143, 255 },
- { 81, 169, 52, 72, 113, 151, 255 },
- { 136, 46, 35, 56, 75, 96, 255 },
- { 57, 84, 109, 47, 107, 131, 255 },
- { 128, 78, 57, 36, 128, 85, 255 },
- { 54, 149, 68, 77, 94, 153, 255 },
- { 243, 58, 50, 71, 81, 167, 255 },
- { 189, 92, 64, 70, 121, 173, 255 },
- { 248, 35, 38, 51, 82, 201, 255 },
- },
- {
- // 8 colors
- { 201, 40, 36, 42, 64, 92, 123 },
- { 116, 43, 33, 43, 73, 102, 128 },
- { 46, 77, 37, 69, 62, 78, 150 },
- { 40, 65, 52, 50, 76, 89, 133 },
- { 28, 48, 91, 17, 64, 77, 133 },
- { 218, 43, 43, 37, 56, 72, 163 },
- { 41, 155, 44, 83, 82, 129, 180 },
- { 44, 141, 29, 55, 64, 89, 147 },
- { 92, 166, 48, 45, 59, 126, 179 },
- { 169, 35, 49, 41, 36, 99, 139 },
- { 55, 77, 77, 56, 60, 75, 156 },
- { 155, 81, 51, 64, 57, 182, 255 },
- { 60, 134, 49, 49, 93, 128, 174 },
- { 244, 98, 51, 46, 22, 73, 238 },
- { 189, 70, 40, 87, 93, 79, 201 },
- { 248, 54, 49, 40, 29, 42, 227 },
- } };
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 71, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 98, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 236, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 222, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 249, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 198, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 178, 105, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 100, 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 12, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 219, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 50, 198, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 61, 231, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 110, 209, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 173, 106, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 145, 166, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 156, 175, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 69, 183, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 241, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 224, 160, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 246, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 173, 88, 143, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 146, 81, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 84, 134, 102, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 69, 138, 140, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 31, 103, 200, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 217, 101, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 51, 174, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 64, 177, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 96, 179, 145, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 164, 77, 114, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 87, 94, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 105, 57, 173, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 63, 158, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 236, 102, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 197, 115, 153, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 245, 106, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 179, 64, 97, 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 137, 56, 88, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 82, 107, 61, 118, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 59, 113, 86, 115, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 23, 88, 118, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 213, 66, 90, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 37, 181, 103, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 188, 61, 131, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 104, 185, 103, 144, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 163, 39, 76, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 74, 131, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 142, 42, 103, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 53, 162, 99, 149, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 239, 54, 84, 108, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 203, 84, 110, 147, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 248, 70, 105, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 189, 50, 67, 90, 130, UNUSED_PROB, UNUSED_PROB },
+ { 114, 50, 55, 90, 123, UNUSED_PROB, UNUSED_PROB },
+ { 66, 76, 54, 82, 128, UNUSED_PROB, UNUSED_PROB },
+ { 43, 69, 69, 80, 129, UNUSED_PROB, UNUSED_PROB },
+ { 22, 59, 87, 88, 141, UNUSED_PROB, UNUSED_PROB },
+ { 203, 49, 68, 87, 122, UNUSED_PROB, UNUSED_PROB },
+ { 43, 157, 74, 104, 146, UNUSED_PROB, UNUSED_PROB },
+ { 54, 138, 51, 95, 138, UNUSED_PROB, UNUSED_PROB },
+ { 82, 171, 58, 102, 146, UNUSED_PROB, UNUSED_PROB },
+ { 129, 38, 59, 64, 168, UNUSED_PROB, UNUSED_PROB },
+ { 56, 67, 119, 92, 112, UNUSED_PROB, UNUSED_PROB },
+ { 96, 62, 53, 132, 82, UNUSED_PROB, UNUSED_PROB },
+ { 60, 147, 77, 108, 145, UNUSED_PROB, UNUSED_PROB },
+ { 238, 76, 73, 93, 148, UNUSED_PROB, UNUSED_PROB },
+ { 189, 86, 73, 103, 157, UNUSED_PROB, UNUSED_PROB },
+ { 246, 62, 75, 83, 167, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 179, 42, 51, 73, 99, 134, UNUSED_PROB },
+ { 119, 52, 52, 61, 64, 114, UNUSED_PROB },
+ { 53, 77, 35, 65, 71, 131, UNUSED_PROB },
+ { 38, 70, 51, 68, 89, 144, UNUSED_PROB },
+ { 23, 65, 128, 73, 97, 131, UNUSED_PROB },
+ { 210, 47, 52, 63, 81, 143, UNUSED_PROB },
+ { 42, 159, 57, 68, 98, 143, UNUSED_PROB },
+ { 49, 153, 45, 82, 93, 143, UNUSED_PROB },
+ { 81, 169, 52, 72, 113, 151, UNUSED_PROB },
+ { 136, 46, 35, 56, 75, 96, UNUSED_PROB },
+ { 57, 84, 109, 47, 107, 131, UNUSED_PROB },
+ { 128, 78, 57, 36, 128, 85, UNUSED_PROB },
+ { 54, 149, 68, 77, 94, 153, UNUSED_PROB },
+ { 243, 58, 50, 71, 81, 167, UNUSED_PROB },
+ { 189, 92, 64, 70, 121, 173, UNUSED_PROB },
+ { 248, 35, 38, 51, 82, 201, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 201, 40, 36, 42, 64, 92, 123 },
+ { 116, 43, 33, 43, 73, 102, 128 },
+ { 46, 77, 37, 69, 62, 78, 150 },
+ { 40, 65, 52, 50, 76, 89, 133 },
+ { 28, 48, 91, 17, 64, 77, 133 },
+ { 218, 43, 43, 37, 56, 72, 163 },
+ { 41, 155, 44, 83, 82, 129, 180 },
+ { 44, 141, 29, 55, 64, 89, 147 },
+ { 92, 166, 48, 45, 59, 126, 179 },
+ { 169, 35, 49, 41, 36, 99, 139 },
+ { 55, 77, 77, 56, 60, 75, 156 },
+ { 155, 81, 51, 64, 57, 182, 255 },
+ { 60, 134, 49, 49, 93, 128, 174 },
+ { 244, 98, 51, 46, 22, 73, 238 },
+ { 189, 70, 40, 87, 93, 79, 201 },
+ { 248, 54, 49, 40, 29, 42, 227 },
+ }
+ };
+
+#undef UNUSED_PROB
static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
// (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
@@ -848,18 +916,18 @@
};
#endif // CONFIG_PALETTE
-const aom_tree_index av1_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)] = {
+const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)] = {
{
// Max tx_size is 8X8
- -TX_4X4, -TX_8X8,
+ -0, -1,
},
{
// Max tx_size is 16X16
- -TX_4X4, 2, -TX_8X8, -TX_16X16,
+ -0, 2, -1, -2,
},
{
// Max tx_size is 32X32
- -TX_4X4, 2, -TX_8X8, 4, -TX_16X16, -TX_32X32,
+ -0, 2, -1, 4, -2, -3,
},
};
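
The tree above now encodes plain depths (0 for the smallest allowed size, up
to MAX_TX_DEPTH) instead of raw TX_SIZE values, presumably because the
CONFIG_CB4X4 change in enums.h later in this diff inserts TX_2X2 ahead of
TX_4X4, so TX_4X4 is no longer fixed at 0. Assuming the decoded leaf is an
offset from TX_4X4 (consistent with the old leaves TX_4X4..TX_32X32), a
reader of the coded symbol would recover the size along these lines; this is
a hypothetical helper, not added by the patch:

  #include "av1/common/enums.h" /* for TX_SIZE, TX_4X4 */
  static TX_SIZE depth_to_tx_size_sketch(int depth) {
    return (TX_SIZE)(TX_4X4 + depth); /* 0 -> TX_4X4, 1 -> TX_8X8, ... */
  }
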
@@ -902,30 +970,26 @@
int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
int c, int n, uint8_t *color_order,
int *color_idx) {
- int i, j, max, max_idx, temp;
+ int i;
+ // The +10 below should not be needed. But we get a warning "array subscript
+ // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+ // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
int scores[PALETTE_MAX_SIZE + 10];
- int weights[4] = { 3, 2, 3, 2 };
- int color_ctx = 0;
+ const int weights[4] = { 3, 2, 3, 2 };
+ int color_ctx_hash;
+ int color_ctx;
int color_neighbors[4];
int inverse_color_order[PALETTE_MAX_SIZE];
assert(n <= PALETTE_MAX_SIZE);
- if (c - 1 >= 0)
- color_neighbors[0] = color_map[r * cols + c - 1];
- else
- color_neighbors[0] = -1;
- if (c - 1 >= 0 && r - 1 >= 0)
- color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
- else
- color_neighbors[1] = -1;
- if (r - 1 >= 0)
- color_neighbors[2] = color_map[(r - 1) * cols + c];
- else
- color_neighbors[2] = -1;
- if (r - 1 >= 0 && c + 1 <= cols - 1)
- color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
- else
- color_neighbors[3] = -1;
+ // Get color indices of neighbors.
+ color_neighbors[0] = (c - 1 >= 0) ? color_map[r * cols + c - 1] : -1;
+ color_neighbors[1] =
+ (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * cols + c - 1] : -1;
+ color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * cols + c] : -1;
+ color_neighbors[3] = (r - 1 >= 0 && c + 1 <= cols - 1)
+ ? color_map[(r - 1) * cols + c + 1]
+ : -1;
for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
color_order[i] = i;
@@ -933,23 +997,25 @@
}
memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
for (i = 0; i < 4; ++i) {
- if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
+ if (color_neighbors[i] >= 0) {
+ scores[color_neighbors[i]] += weights[i];
+ }
}
+ // Get the top 4 scores (sorted from large to small).
for (i = 0; i < 4; ++i) {
- max = scores[i];
- max_idx = i;
- j = i + 1;
- while (j < n) {
+ int max = scores[i];
+ int max_idx = i;
+ int j;
+ for (j = i + 1; j < n; ++j) {
if (scores[j] > max) {
max = scores[j];
max_idx = j;
}
- ++j;
}
if (max_idx != i) {
- temp = scores[i];
+ int temp = scores[i];
scores[i] = scores[max_idx];
scores[max_idx] = temp;
@@ -961,15 +1027,19 @@
}
}
- for (i = 0; i < 4; ++i) color_ctx = color_ctx * 11 + scores[i];
+ // Get hash value of context.
+ color_ctx_hash = 0;
+ for (i = 0; i < 4; ++i) color_ctx_hash = color_ctx_hash * 11 + scores[i];
- for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
- if (color_ctx == palette_color_context_lookup[i]) {
+ // Lookup context from hash.
+ color_ctx = 0; // Default.
+ for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i) {
+ if (color_ctx_hash == palette_color_context_lookup[i]) {
color_ctx = i;
break;
}
+ }
- if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
if (color_idx != NULL) {
*color_idx = inverse_color_order[color_map[r * cols + c]];
}
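
The rewrite separates the raw base-11 hash of the four sorted neighbor scores
(color_ctx_hash) from the context index looked up in
palette_color_context_lookup, defaulting to 0 when the hash is not found. As
a worked example (illustration only, not code from this patch): on the top
row with c > 0 only the left neighbor exists, so one color scores
weights[0] = 3, the sorted scores are (3, 0, 0, 0), and the hash is
3 * 11 * 11 * 11 = 3993, the first tuple listed in the lookup table's
comment.

  /* Illustrative only: the hash computed by the loop above. */
  static int palette_ctx_hash_sketch(const int sorted_scores[4]) {
    int i, hash = 0;
    for (i = 0; i < 4; ++i) hash = hash * 11 + sorted_scores[i];
    return hash; /* (3, 0, 0, 0) -> 3993 */
  }
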
@@ -979,7 +1049,7 @@
#if CONFIG_VAR_TX
static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
- 192, 128, 64, 192, 128, 64, 192, 128, 64,
+ 250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128,
};
#endif
@@ -1310,13 +1380,21 @@
static const aom_prob
default_intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1] = {
+#if CONFIG_CB4X4
+ { { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
+#endif
{ { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
{ { 244, 85, 128 }, { 8, 2, 248 }, { 8, 2, 8 }, { 8, 248, 128 } },
{ { 248, 85, 128 }, { 16, 4, 248 }, { 16, 4, 8 }, { 16, 248, 128 } },
};
static const aom_prob default_inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1] = {
- { 160, 85, 128 }, { 176, 85, 128 }, { 192, 85, 128 },
+#if CONFIG_CB4X4
+ { 160, 85, 128 },
+#endif
+ { 160, 85, 128 },
+ { 176, 85, 128 },
+ { 192, 85, 128 },
};
#endif // CONFIG_EXT_TX
@@ -1440,6 +1518,48 @@
#if CONFIG_DAALA_EC
int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+void av1_set_mode_cdfs(struct AV1Common *cm) {
+ FRAME_CONTEXT *fc = cm->fc;
+ int i, j;
+ if (cm->seg.enabled && cm->seg.update_map) {
+ av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
+ cm->fc->seg.tree_cdf);
+ }
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
+ fc->uv_mode_cdf[i]);
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
+ fc->partition_cdf[i]);
+
+ for (i = 0; i < INTRA_MODES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+ cm->kf_y_cdf[i][j]);
+
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
+ fc->switchable_interp_cdf[j]);
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
+ fc->inter_mode_cdf[i]);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[i], fc->y_mode_cdf[i]);
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ for (j = 0; j < TX_TYPES; ++j)
+ av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
+ fc->intra_ext_tx_cdf[i][j]);
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+ av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
+ fc->inter_ext_tx_cdf[i]);
+}
#endif
#if CONFIG_EXT_INTERP
@@ -1561,7 +1681,7 @@
#if CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
if (cm->tx_mode == TX_MODE_SELECT) {
- for (i = 0; i < TX_SIZES - 1; ++i) {
+ for (i = 0; i < MAX_TX_DEPTH; ++i) {
fc->rect_tx_prob[i] =
av1_mode_mv_merge_probs(pre_fc->rect_tx_prob[i], counts->rect_tx[i]);
}
@@ -1637,19 +1757,11 @@
aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->intra_ext_tx_prob[i][j],
counts->intra_ext_tx[i][j],
fc->intra_ext_tx_prob[i][j]);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
- fc->intra_ext_tx_cdf[i][j]);
-#endif
}
}
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->inter_ext_tx_prob[i],
counts->inter_ext_tx[i], fc->inter_ext_tx_prob[i]);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
- fc->inter_ext_tx_cdf[i]);
-#endif
}
#endif // CONFIG_EXT_TX
@@ -1679,10 +1791,6 @@
for (i = 0; i < PARTITION_CONTEXTS; i++) {
aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
counts->partition[i], fc->partition_prob[i]);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
- fc->partition_cdf[i]);
-#endif
}
#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DELTA_Q
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 31ab65d..1d95cdc 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -62,9 +62,9 @@
aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
#endif
av1_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
coeff_cdf_model coef_cdfs[TX_SIZES][PLANE_TYPES];
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
aom_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
@@ -324,7 +324,7 @@
extern const aom_tree_index av1_palette_color_tree[PALETTE_MAX_SIZE - 1]
[TREE_SIZE(PALETTE_COLORS)];
#endif // CONFIG_PALETTE
-extern const aom_tree_index av1_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)];
+extern const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)];
#if CONFIG_EXT_INTRA
extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
#endif // CONFIG_EXT_INTRA
@@ -349,6 +349,8 @@
#if CONFIG_DAALA_EC
extern int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
extern int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+void av1_set_mode_cdfs(struct AV1Common *cm);
#endif
void av1_setup_past_independence(struct AV1Common *cm);
diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c
index a80165e..029f9f6 100644
--- a/av1/common/entropymv.c
+++ b/av1/common/entropymv.c
@@ -43,21 +43,21 @@
static const nmv_context default_nmv_context = {
{ 32, 64, 96 }, // joints
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
{ 0, 0, 0, 0 }, // joint_cdf is computed from joints in av1_init_mv_probs()
#endif
{ {
// Vertical component
128, // sign
{ 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 }, // class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
{ 0 }, // class_cdf is computed from class in av1_init_mv_probs()
#endif
{ 216 }, // class0
{ 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits
{ { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp
{ 64, 96, 64 }, // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
{ { 0 }, { 0 } }, // class0_fp_cdf is computed in av1_init_mv_probs()
{ 0 }, // fp_cdf is computed from fp in av1_init_mv_probs()
#endif
@@ -68,14 +68,14 @@
// Horizontal component
128, // sign
{ 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 }, // class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
{ 0 }, // class_cdf is computed from class in av1_init_mv_probs()
#endif
{ 208 }, // class0
{ 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 }, // bits
{ { 128, 128, 64 }, { 96, 112, 64 } }, // class0_fp
{ 64, 96, 64 }, // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
{ { 0 }, { 0 } }, // class0_fp_cdf is computed in av1_init_mv_probs()
{ 0 }, // fp_cdf is computed from fp in av1_init_mv_probs()
#endif
@@ -149,13 +149,6 @@
return c;
}
-// TODO(jingning): This idle function is intentionally left as is for
-// experimental purpose.
-int av1_use_mv_hp(const MV *ref) {
- (void)ref;
- return 1;
-}
-
static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr,
int usehp) {
int s, z, c, o, d, e, f;
@@ -273,28 +266,33 @@
#endif
}
+#if CONFIG_EC_MULTISYMBOL
+void av1_set_mv_cdfs(nmv_context *ctx) {
+ int i;
+ int j;
+ av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
+
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
+ comp_ctx->class0_fp_cdf[j]);
+ }
+ av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
+ }
+}
+#endif
+
void av1_init_mv_probs(AV1_COMMON *cm) {
#if CONFIG_REF_MV
int i;
for (i = 0; i < NMV_CONTEXTS; ++i) cm->fc->nmvc[i] = default_nmv_context;
#else
cm->fc->nmvc = default_nmv_context;
-#if CONFIG_DAALA_EC
- {
- int i, j;
- av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
- cm->fc->nmvc.joint_cdf);
- for (i = 0; i < 2; i++) {
- av1_tree_to_cdf(av1_mv_class_tree, cm->fc->nmvc.comps[i].classes,
- cm->fc->nmvc.comps[i].class_cdf);
- av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].fp,
- cm->fc->nmvc.comps[i].fp_cdf);
- for (j = 0; j < CLASS0_SIZE; j++) {
- av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].class0_fp[j],
- cm->fc->nmvc.comps[i].class0_fp_cdf[j]);
- }
- }
- }
+#if CONFIG_EC_MULTISYMBOL
+ av1_set_mv_cdfs(&cm->fc->nmvc);
#endif
#endif
#if CONFIG_GLOBAL_MOTION
diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h
index f308ef3..1ebbdb2 100644
--- a/av1/common/entropymv.h
+++ b/av1/common/entropymv.h
@@ -27,7 +27,6 @@
void av1_init_mv_probs(struct AV1Common *cm);
void av1_adapt_mv_probs(struct AV1Common *cm, int usehp);
-int av1_use_mv_hp(const MV *ref);
#define MV_UPDATE_PROB 252
@@ -85,14 +84,14 @@
typedef struct {
aom_prob sign;
aom_prob classes[MV_CLASSES - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob class_cdf[MV_CLASSES];
#endif
aom_prob class0[CLASS0_SIZE - 1];
aom_prob bits[MV_OFFSET_BITS];
aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
aom_prob fp[MV_FP_SIZE - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][MV_FP_SIZE];
aom_cdf_prob fp_cdf[MV_FP_SIZE];
#endif
@@ -102,7 +101,7 @@
typedef struct {
aom_prob joints[MV_JOINTS - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob joint_cdf[MV_JOINTS];
#endif
nmv_component comps[2];
@@ -135,11 +134,13 @@
} nmv_context_counts;
void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
-
#if CONFIG_GLOBAL_MOTION
extern const aom_tree_index
av1_global_motion_types_tree[TREE_SIZE(GLOBAL_MOTION_TYPES)];
#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EC_MULTISYMBOL
+void av1_set_mv_cdfs(nmv_context *ctx);
+#endif
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 701d4b9..2ec83ec 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -131,18 +131,19 @@
// block transform size
typedef enum ATTRIBUTE_PACKED {
- TX_4X4, // 4x4 transform
- TX_8X8, // 8x8 transform
- TX_16X16, // 16x16 transform
- TX_32X32, // 32x32 transform
-#if CONFIG_EXT_TX
+#if CONFIG_CB4X4
+ TX_2X2, // 2x2 transform
+#endif
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
TX_4X8, // 4x8 transform
TX_8X4, // 8x4 transform
TX_8X16, // 8x16 transform
TX_16X8, // 16x8 transform
TX_16X32, // 16x32 transform
TX_32X16, // 32x16 transform
-#endif // CONFIG_EXT_TX
TX_SIZES_ALL, // Includes rectangular transforms
TX_SIZES = TX_32X32 + 1, // Does NOT include rectangular transforms
TX_INVALID = 255 // Invalid transform size
@@ -208,8 +209,12 @@
#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
#else
+#if CONFIG_CB4X4
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#else
#define EXT_TX_SIZES 3 // number of sizes that use extended transforms
-#endif // CONFIG_EXT_TX
+#endif
+#endif // CONFIG_EXT_TX
typedef enum {
AOM_LAST_FLAG = 1 << 0,
@@ -401,8 +406,8 @@
#define REF_CONTEXTS 5
#if CONFIG_VAR_TX
-#define TXFM_PARTITION_CONTEXTS 9
-typedef TX_SIZE TXFM_CONTEXT;
+#define TXFM_PARTITION_CONTEXTS 16
+typedef uint8_t TXFM_CONTEXT;
#endif
#define NONE -1
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 15fc806..eb39a7f 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -38,8 +38,8 @@
#else
#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */
#define LOG_SWITCHABLE_FILTERS \
- 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
-#endif // CONFIG_EXT_INTERP
+ 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#endif // CONFIG_EXT_INTERP
#define USE_TEMPORALFILTER_12TAP 1
#if USE_TEMPORALFILTER_12TAP
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 4f33f9b..b5e3742 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -58,6 +58,7 @@
int i;
for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
}
+#endif // CONFIG_EXT_TX
// For use in lieu of ADST
static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
@@ -70,11 +71,53 @@
for (i = 0; i < 16; ++i) {
output[i] = input[16 + i] * 4;
}
- idct16_c(inputhalf, output + 16);
+ aom_idct16_c(inputhalf, output + 16);
// Note overall scaling factor is 4 times orthogonal
}
+#if CONFIG_TX64X64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
+ }
+ aom_idct32_c(inputhalf, output + 32);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+#endif // CONFIG_TX64X64
+
#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
int bd) {
int i;
@@ -120,9 +163,61 @@
aom_highbd_idct16_c(inputhalf, output + 16, bd);
// Note overall scaling factor is 4 times orthogonal
}
+
+#if CONFIG_TX64X64
+static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
+}
+
+// For use in lieu of ADST
+static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] = HIGHBD_WRAPLOW(
+ highbd_dct_const_round_shift(input[32 + i] * 4 * Sqrt2), bd);
+ }
+ aom_highbd_idct32_c(inputhalf, output + 32, bd);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+
+static void highbd_idct64_col_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_EXT_TX
#endif // CONFIG_AOM_HIGHBITDEPTH
// Inverse identity transform and add.
+#if CONFIG_EXT_TX
static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int bs, int tx_type) {
int r, c;
@@ -136,6 +231,7 @@
}
}
}
+#endif // CONFIG_EXT_TX
#define FLIPUD_PTR(dest, stride, size) \
do { \
@@ -143,6 +239,7 @@
(stride) = -(stride); \
} while (0)
+#if CONFIG_EXT_TX
static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
int *sstride, int tx_type, int sizey,
int sizex) {
@@ -180,8 +277,10 @@
default: assert(0); break;
}
}
+#endif // CONFIG_EXT_TX
#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bs, int tx_type, int bd) {
int r, c;
@@ -235,30 +334,30 @@
default: assert(0); break;
}
}
-#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
+#endif // CONFIG_AOM_HIGHBITDEPTH
void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_4[] = {
- { idct4_c, idct4_c }, // DCT_DCT
- { iadst4_c, idct4_c }, // ADST_DCT
- { idct4_c, iadst4_c }, // DCT_ADST
- { iadst4_c, iadst4_c }, // ADST_ADST
+ { aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
+ { aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
+ { aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_ADST = 3
#if CONFIG_EXT_TX
- { iadst4_c, idct4_c }, // FLIPADST_DCT
- { idct4_c, iadst4_c }, // DCT_FLIPADST
- { iadst4_c, iadst4_c }, // FLIPADST_FLIPADST
- { iadst4_c, iadst4_c }, // ADST_FLIPADST
- { iadst4_c, iadst4_c }, // FLIPADST_ADST
- { iidtx4_c, iidtx4_c }, // IDTX
- { idct4_c, iidtx4_c }, // V_DCT
- { iidtx4_c, idct4_c }, // H_DCT
- { iadst4_c, iidtx4_c }, // V_ADST
- { iidtx4_c, iadst4_c }, // H_ADST
- { iadst4_c, iidtx4_c }, // V_FLIPADST
- { iidtx4_c, iadst4_c }, // H_FLIPADST
-#endif // CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx4_c }, // IDTX
+ { aom_idct4_c, iidtx4_c }, // V_DCT
+ { iidtx4_c, aom_idct4_c }, // H_DCT
+ { aom_iadst4_c, iidtx4_c }, // V_ADST
+ { iidtx4_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst4_c, iidtx4_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst4_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
};
int i, j;
@@ -301,26 +400,27 @@
}
}
-#if CONFIG_EXT_TX
void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_4x8[] = {
- { idct8_c, idct4_c }, // DCT_DCT
- { iadst8_c, idct4_c }, // ADST_DCT
- { idct8_c, iadst4_c }, // DCT_ADST
- { iadst8_c, iadst4_c }, // ADST_ADST
- { iadst8_c, idct4_c }, // FLIPADST_DCT
- { idct8_c, iadst4_c }, // DCT_FLIPADST
- { iadst8_c, iadst4_c }, // FLIPADST_FLIPADST
- { iadst8_c, iadst4_c }, // ADST_FLIPADST
- { iadst8_c, iadst4_c }, // FLIPADST_ADST
- { iidtx8_c, iidtx4_c }, // IDTX
- { idct8_c, iidtx4_c }, // V_DCT
- { iidtx8_c, idct4_c }, // H_DCT
- { iadst8_c, iidtx4_c }, // V_ADST
- { iidtx8_c, iadst4_c }, // H_ADST
- { iadst8_c, iidtx4_c }, // V_FLIPADST
- { iidtx8_c, iadst4_c }, // H_FLIPADST
+ { aom_idct8_c, aom_idct4_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct4_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx4_c }, // IDTX
+ { aom_idct8_c, iidtx4_c }, // V_DCT
+ { iidtx8_c, aom_idct4_c }, // H_DCT
+ { aom_iadst8_c, iidtx4_c }, // V_ADST
+ { iidtx8_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst8_c, iidtx4_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst4_c }, // H_FLIPADST
+#endif
};
const int n = 4;
@@ -343,7 +443,9 @@
IHT_4x8[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
// Sum with the destination
for (i = 0; i < n2; ++i) {
@@ -358,22 +460,24 @@
void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_8x4[] = {
- { idct4_c, idct8_c }, // DCT_DCT
- { iadst4_c, idct8_c }, // ADST_DCT
- { idct4_c, iadst8_c }, // DCT_ADST
- { iadst4_c, iadst8_c }, // ADST_ADST
- { iadst4_c, idct8_c }, // FLIPADST_DCT
- { idct4_c, iadst8_c }, // DCT_FLIPADST
- { iadst4_c, iadst8_c }, // FLIPADST_FLIPADST
- { iadst4_c, iadst8_c }, // ADST_FLIPADST
- { iadst4_c, iadst8_c }, // FLIPADST_ADST
- { iidtx4_c, iidtx8_c }, // IDTX
- { idct4_c, iidtx8_c }, // V_DCT
- { iidtx4_c, idct8_c }, // H_DCT
- { iadst4_c, iidtx8_c }, // V_ADST
- { iidtx4_c, iadst8_c }, // H_ADST
- { iadst4_c, iidtx8_c }, // V_FLIPADST
- { iidtx4_c, iadst8_c }, // H_FLIPADST
+ { aom_idct4_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst4_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx8_c }, // IDTX
+ { aom_idct4_c, iidtx8_c }, // V_DCT
+ { iidtx4_c, aom_idct8_c }, // H_DCT
+ { aom_iadst4_c, iidtx8_c }, // V_ADST
+ { iidtx4_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst4_c, iidtx8_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst8_c }, // H_FLIPADST
+#endif
};
const int n = 4;
const int n2 = 8;
@@ -396,7 +500,9 @@
IHT_8x4[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
// Sum with the destination
for (i = 0; i < n; ++i) {
@@ -411,22 +517,24 @@
void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_8x16[] = {
- { idct16_c, idct8_c }, // DCT_DCT
- { iadst16_c, idct8_c }, // ADST_DCT
- { idct16_c, iadst8_c }, // DCT_ADST
- { iadst16_c, iadst8_c }, // ADST_ADST
- { iadst16_c, idct8_c }, // FLIPADST_DCT
- { idct16_c, iadst8_c }, // DCT_FLIPADST
- { iadst16_c, iadst8_c }, // FLIPADST_FLIPADST
- { iadst16_c, iadst8_c }, // ADST_FLIPADST
- { iadst16_c, iadst8_c }, // FLIPADST_ADST
- { iidtx16_c, iidtx8_c }, // IDTX
- { idct16_c, iidtx8_c }, // V_DCT
- { iidtx16_c, idct8_c }, // H_DCT
- { iadst16_c, iidtx8_c }, // V_ADST
- { iidtx16_c, iadst8_c }, // H_ADST
- { iadst16_c, iidtx8_c }, // V_FLIPADST
- { iidtx16_c, iadst8_c }, // H_FLIPADST
+ { aom_idct16_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx8_c }, // IDTX
+ { aom_idct16_c, iidtx8_c }, // V_DCT
+ { iidtx16_c, aom_idct8_c }, // H_DCT
+ { aom_iadst16_c, iidtx8_c }, // V_ADST
+ { iidtx16_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst16_c, iidtx8_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst8_c }, // H_FLIPADST
+#endif
};
const int n = 8;
@@ -449,7 +557,9 @@
IHT_8x16[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
// Sum with the destination
for (i = 0; i < n2; ++i) {
@@ -464,22 +574,24 @@
void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_16x8[] = {
- { idct8_c, idct16_c }, // DCT_DCT
- { iadst8_c, idct16_c }, // ADST_DCT
- { idct8_c, iadst16_c }, // DCT_ADST
- { iadst8_c, iadst16_c }, // ADST_ADST
- { iadst8_c, idct16_c }, // FLIPADST_DCT
- { idct8_c, iadst16_c }, // DCT_FLIPADST
- { iadst8_c, iadst16_c }, // FLIPADST_FLIPADST
- { iadst8_c, iadst16_c }, // ADST_FLIPADST
- { iadst8_c, iadst16_c }, // FLIPADST_ADST
- { iidtx8_c, iidtx16_c }, // IDTX
- { idct8_c, iidtx16_c }, // V_DCT
- { iidtx8_c, idct16_c }, // H_DCT
- { iadst8_c, iidtx16_c }, // V_ADST
- { iidtx8_c, iadst16_c }, // H_ADST
- { iadst8_c, iidtx16_c }, // V_FLIPADST
- { iidtx8_c, iadst16_c }, // H_FLIPADST
+ { aom_idct8_c, aom_idct16_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx16_c }, // IDTX
+ { aom_idct8_c, iidtx16_c }, // V_DCT
+ { iidtx8_c, aom_idct16_c }, // H_DCT
+ { aom_iadst8_c, iidtx16_c }, // V_ADST
+ { iidtx8_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst8_c, iidtx16_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst16_c }, // H_FLIPADST
+#endif
};
const int n = 8;
const int n2 = 16;
@@ -502,7 +614,9 @@
IHT_16x8[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
// Sum with the destination
for (i = 0; i < n; ++i) {
@@ -517,22 +631,24 @@
void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_16x32[] = {
- { idct32_c, idct16_c }, // DCT_DCT
- { ihalfright32_c, idct16_c }, // ADST_DCT
- { idct32_c, iadst16_c }, // DCT_ADST
- { ihalfright32_c, iadst16_c }, // ADST_ADST
- { ihalfright32_c, idct16_c }, // FLIPADST_DCT
- { idct32_c, iadst16_c }, // DCT_FLIPADST
- { ihalfright32_c, iadst16_c }, // FLIPADST_FLIPADST
- { ihalfright32_c, iadst16_c }, // ADST_FLIPADST
- { ihalfright32_c, iadst16_c }, // FLIPADST_ADST
- { iidtx32_c, iidtx16_c }, // IDTX
- { idct32_c, iidtx16_c }, // V_DCT
- { iidtx32_c, idct16_c }, // H_DCT
- { ihalfright32_c, iidtx16_c }, // V_ADST
- { iidtx32_c, iadst16_c }, // H_ADST
- { ihalfright32_c, iidtx16_c }, // V_FLIPADST
- { iidtx32_c, iadst16_c }, // H_FLIPADST
+ { aom_idct32_c, aom_idct16_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_ADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright32_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx16_c }, // IDTX
+ { aom_idct32_c, iidtx16_c }, // V_DCT
+ { iidtx32_c, aom_idct16_c }, // H_DCT
+ { ihalfright32_c, iidtx16_c }, // V_ADST
+ { iidtx32_c, aom_iadst16_c }, // H_ADST
+ { ihalfright32_c, iidtx16_c }, // V_FLIPADST
+ { iidtx32_c, aom_iadst16_c }, // H_FLIPADST
+#endif
};
const int n = 16;
@@ -555,7 +671,9 @@
IHT_16x32[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
// Sum with the destination
for (i = 0; i < n2; ++i) {
@@ -570,22 +688,24 @@
void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_32x16[] = {
- { idct16_c, idct32_c }, // DCT_DCT
- { iadst16_c, idct32_c }, // ADST_DCT
- { idct16_c, ihalfright32_c }, // DCT_ADST
- { iadst16_c, ihalfright32_c }, // ADST_ADST
- { iadst16_c, idct32_c }, // FLIPADST_DCT
- { idct16_c, ihalfright32_c }, // DCT_FLIPADST
- { iadst16_c, ihalfright32_c }, // FLIPADST_FLIPADST
- { iadst16_c, ihalfright32_c }, // ADST_FLIPADST
- { iadst16_c, ihalfright32_c }, // FLIPADST_ADST
- { iidtx16_c, iidtx32_c }, // IDTX
- { idct16_c, iidtx32_c }, // V_DCT
- { iidtx16_c, idct32_c }, // H_DCT
- { iadst16_c, iidtx32_c }, // V_ADST
- { iidtx16_c, ihalfright32_c }, // H_ADST
- { iadst16_c, iidtx32_c }, // V_FLIPADST
- { iidtx16_c, ihalfright32_c }, // H_FLIPADST
+ { aom_idct16_c, aom_idct32_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_ADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx32_c }, // IDTX
+ { aom_idct16_c, iidtx32_c }, // V_DCT
+ { iidtx16_c, aom_idct32_c }, // H_DCT
+ { aom_iadst16_c, iidtx32_c }, // V_ADST
+ { iidtx16_c, ihalfright32_c }, // H_ADST
+ { aom_iadst16_c, iidtx32_c }, // V_FLIPADST
+ { iidtx16_c, ihalfright32_c }, // H_FLIPADST
+#endif
};
const int n = 16;
const int n2 = 32;
@@ -608,7 +728,9 @@
IHT_32x16[tx_type].cols(out[i], out[i]);
}
+#if CONFIG_EXT_TX
maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
// Sum with the destination
for (i = 0; i < n; ++i) {
@@ -619,29 +741,28 @@
}
}
}
-#endif // CONFIG_EXT_TX
void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_8[] = {
- { idct8_c, idct8_c }, // DCT_DCT
- { iadst8_c, idct8_c }, // ADST_DCT
- { idct8_c, iadst8_c }, // DCT_ADST
- { iadst8_c, iadst8_c }, // ADST_ADST
+ { aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0
+ { aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1
+ { aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_ADST = 3
#if CONFIG_EXT_TX
- { iadst8_c, idct8_c }, // FLIPADST_DCT
- { idct8_c, iadst8_c }, // DCT_FLIPADST
- { iadst8_c, iadst8_c }, // FLIPADST_FLIPADST
- { iadst8_c, iadst8_c }, // ADST_FLIPADST
- { iadst8_c, iadst8_c }, // FLIPADST_ADST
- { iidtx8_c, iidtx8_c }, // IDTX
- { idct8_c, iidtx8_c }, // V_DCT
- { iidtx8_c, idct8_c }, // H_DCT
- { iadst8_c, iidtx8_c }, // V_ADST
- { iidtx8_c, iadst8_c }, // H_ADST
- { iadst8_c, iidtx8_c }, // V_FLIPADST
- { iidtx8_c, iadst8_c }, // H_FLIPADST
-#endif // CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx8_c }, // IDTX
+ { aom_idct8_c, iidtx8_c }, // V_DCT
+ { iidtx8_c, aom_idct8_c }, // H_DCT
+ { aom_iadst8_c, iidtx8_c }, // V_ADST
+ { iidtx8_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst8_c, iidtx8_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
};
int i, j;
@@ -687,24 +808,24 @@
void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_16[] = {
- { idct16_c, idct16_c }, // DCT_DCT
- { iadst16_c, idct16_c }, // ADST_DCT
- { idct16_c, iadst16_c }, // DCT_ADST
- { iadst16_c, iadst16_c }, // ADST_ADST
+ { aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0
+ { aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1
+ { aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_ADST = 3
#if CONFIG_EXT_TX
- { iadst16_c, idct16_c }, // FLIPADST_DCT
- { idct16_c, iadst16_c }, // DCT_FLIPADST
- { iadst16_c, iadst16_c }, // FLIPADST_FLIPADST
- { iadst16_c, iadst16_c }, // ADST_FLIPADST
- { iadst16_c, iadst16_c }, // FLIPADST_ADST
- { iidtx16_c, iidtx16_c }, // IDTX
- { idct16_c, iidtx16_c }, // V_DCT
- { iidtx16_c, idct16_c }, // H_DCT
- { iadst16_c, iidtx16_c }, // V_ADST
- { iidtx16_c, iadst16_c }, // H_ADST
- { iadst16_c, iidtx16_c }, // V_FLIPADST
- { iidtx16_c, iadst16_c }, // H_FLIPADST
-#endif // CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx16_c }, // IDTX
+ { aom_idct16_c, iidtx16_c }, // V_DCT
+ { iidtx16_c, aom_idct16_c }, // H_DCT
+ { aom_iadst16_c, iidtx16_c }, // V_ADST
+ { iidtx16_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst16_c, iidtx16_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
};
int i, j;
@@ -751,22 +872,22 @@
void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
static const transform_2d IHT_32[] = {
- { idct32_c, idct32_c }, // DCT_DCT
- { ihalfright32_c, idct32_c }, // ADST_DCT
- { idct32_c, ihalfright32_c }, // DCT_ADST
+ { aom_idct32_c, aom_idct32_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_ADST
{ ihalfright32_c, ihalfright32_c }, // ADST_ADST
- { ihalfright32_c, idct32_c }, // FLIPADST_DCT
- { idct32_c, ihalfright32_c }, // DCT_FLIPADST
+ { ihalfright32_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_FLIPADST
{ ihalfright32_c, ihalfright32_c }, // FLIPADST_FLIPADST
{ ihalfright32_c, ihalfright32_c }, // ADST_FLIPADST
{ ihalfright32_c, ihalfright32_c }, // FLIPADST_ADST
{ iidtx32_c, iidtx32_c }, // IDTX
- { idct32_c, iidtx32_c }, // V_DCT
- { iidtx32_c, idct32_c }, // H_DCT
- { ihalfright32_c, iidtx16_c }, // V_ADST
- { iidtx16_c, ihalfright32_c }, // H_ADST
- { ihalfright32_c, iidtx16_c }, // V_FLIPADST
- { iidtx16_c, ihalfright32_c }, // H_FLIPADST
+ { aom_idct32_c, iidtx32_c }, // V_DCT
+ { iidtx32_c, aom_idct32_c }, // H_DCT
+ { ihalfright32_c, iidtx32_c }, // V_ADST
+ { iidtx32_c, ihalfright32_c }, // H_ADST
+ { ihalfright32_c, iidtx32_c }, // V_FLIPADST
+ { iidtx32_c, ihalfright32_c }, // H_FLIPADST
};
int i, j;
@@ -806,6 +927,68 @@
}
}
}
+
+#if CONFIG_TX64X64
+void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_64[] = {
+ { idct64_col_c, idct64_row_c }, // DCT_DCT
+ { ihalfright64_c, idct64_row_c }, // ADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_ADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_ADST
+ { ihalfright64_c, idct64_row_c }, // FLIPADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_ADST
+ { iidtx64_c, iidtx64_c }, // IDTX
+ { idct64_col_c, iidtx64_c }, // V_DCT
+ { iidtx64_c, idct64_row_c }, // H_DCT
+ { ihalfright64_c, iidtx64_c }, // V_ADST
+ { iidtx64_c, ihalfright64_c }, // H_ADST
+ { ihalfright64_c, iidtx64_c }, // V_FLIPADST
+ { iidtx64_c, ihalfright64_c }, // H_FLIPADST
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ // inverse transform row vectors
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].rows(input, out[i]);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].cols(out[i], out[i]);
+ }
+
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+#endif // CONFIG_TX64X64
#endif // CONFIG_EXT_TX
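
The new 64x64 functions above follow the same separable pattern as the other block sizes: 1-D row transforms (with an extra downshift for the 64-point case), an in-place transpose, 1-D column transforms, then a rounded add back onto the prediction. A minimal self-contained sketch of that pattern, assuming hypothetical 1-D helpers row_txfm/col_txfm and ignoring the FLIPADST stride flipping:

  #include <stdint.h>

  #define RND_POW2(v, n) (((v) + (1 << ((n)-1))) >> (n))

  static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

  // n x n separable inverse transform followed by reconstruction add.
  static void inv_txfm2d_add_sketch(const int32_t *input, uint8_t *dest,
                                    int stride, int n,
                                    void (*row_txfm)(const int32_t *, int32_t *),
                                    void (*col_txfm)(const int32_t *, int32_t *)) {
    int32_t out[64][64];  // n <= 64 assumed
    int i, j;
    // Inverse transform the row vectors; the intermediate downshift by 1
    // mirrors what the 64x64 path above does per row.
    for (i = 0; i < n; ++i) {
      row_txfm(input + i * n, out[i]);
      for (j = 0; j < n; ++j) out[i][j] = RND_POW2(out[i][j], 1);
    }
    // Transpose in place so the columns can be transformed as rows.
    for (i = 1; i < n; ++i) {
      for (j = 0; j < i; ++j) {
        const int32_t t = out[i][j];
        out[i][j] = out[j][i];
        out[j][i] = t;
      }
    }
    // Inverse transform the column vectors.
    for (i = 0; i < n; ++i) col_txfm(out[i], out[i]);
    // Add the (still transposed) residual onto the prediction with rounding.
    for (i = 0; i < n; ++i)
      for (j = 0; j < n; ++j)
        dest[i * stride + j] =
            clip_u8(dest[i * stride + j] + RND_POW2(out[j][i], 5));
  }
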
// idct
@@ -905,7 +1088,6 @@
}
}
-#if CONFIG_EXT_TX
void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type) {
(void)eob;
@@ -941,7 +1123,6 @@
(void)eob;
av1_iht32x16_512_add(input, dest, stride, tx_type);
}
-#endif // CONFIG_EXT_TX
void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type) {
@@ -984,17 +1165,12 @@
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- av1_iht16x16_256_add(input, dest, stride, tx_type);
- break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- // Use C version since DST only exists in C code
- av1_iht16x16_256_add_c(input, dest, stride, tx_type);
- break;
+ case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
#endif // CONFIG_EXT_TX
default: assert(0); break;
@@ -1635,6 +1811,71 @@
}
}
}
+
+#if CONFIG_TX64X64
+void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_64[] = {
+ { highbd_idct64_col_c, highbd_idct64_row_c }, // DCT_DCT
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // ADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_ADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_ADST
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // FLIPADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_ADST
+ { highbd_iidtx64_c, highbd_iidtx64_c }, // IDTX
+ { highbd_idct64_col_c, highbd_iidtx64_c }, // V_DCT
+ { highbd_iidtx64_c, highbd_idct64_row_c }, // H_DCT
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_ADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_ADST
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_FLIPADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_FLIPADST
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ // inverse transform row vectors
+ for (i = 0; i < 64; ++i) {
+ HIGH_IHT_64[tx_type].rows(input, out[i], bd);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+ HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
+ }
+
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+#endif // CONFIG_TX64X64
#endif // CONFIG_EXT_TX
// idct
@@ -1909,7 +2150,6 @@
av1_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
break;
case TX_8X8: av1_inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break;
-#if CONFIG_EXT_TX
case TX_4X8: av1_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); break;
case TX_8X4: av1_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); break;
case TX_8X16:
@@ -1924,7 +2164,6 @@
case TX_32X16:
av1_inv_txfm_add_32x16(input, dest, stride, eob, tx_type);
break;
-#endif // CONFIG_EXT_TX
case TX_4X4:
// this is like av1_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
diff --git a/av1/common/idct.h b/av1/common/idct.h
index 1acc825..db9a6e2 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -67,12 +67,10 @@
void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type, int lossless);
-#if CONFIG_EXT_TX
void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
-#endif // CONFIG_EXT_TX
void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
@@ -95,12 +93,10 @@
void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type,
int lossless);
-#if CONFIG_EXT_TX
void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type);
void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type);
-#endif // CONFIG_EXT_TX
void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type);
void av1_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index d0b897c..25ce24a 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -40,6 +40,9 @@
//
// A loopfilter should be applied to every other 8x8 horizontally.
static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffffffffffffffffULL, // TX_2X2
+#endif
0xffffffffffffffffULL, // TX_4X4
0xffffffffffffffffULL, // TX_8x8
0x5555555555555555ULL, // TX_16x16
@@ -64,6 +67,9 @@
//
// A loopfilter should be applied to every other 4-row unit vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffffffffffffffffULL, // TX_2X2
+#endif
0xffffffffffffffffULL, // TX_4X4
0xffffffffffffffffULL, // TX_8x8
0x00ff00ff00ff00ffULL, // TX_16x16
@@ -142,6 +148,9 @@
// 16 bit masks for uv transform sizes.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffff, // TX_2X2
+#endif
0xffff, // TX_4X4
0xffff, // TX_8x8
0x5555, // TX_16x16
@@ -149,6 +158,9 @@
};
static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+ 0xffff, // TX_2X2
+#endif
0xffff, // TX_4X4
0xffff, // TX_8x8
0x0f0f, // TX_16x16
@@ -1054,6 +1066,8 @@
lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
}
+ } else {
+ lfm->above_int_4x4_uv = lfm->left_int_4x4_uv;
}
if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
@@ -1183,21 +1197,24 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-void av1_filter_block_plane_non420(AV1_COMMON *cm,
- struct macroblockd_plane *plane,
- MODE_INFO **mib, int mi_row, int mi_col) {
+void av1_filter_block_plane_non420_ver(AV1_COMMON *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mib, int mi_row,
+ int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_y;
const int col_step = 1 << ss_x;
+ const int row_step_stride = cm->mi_stride * row_step;
struct buf_2d *const dst = &plane->dst;
uint8_t *const dst0 = dst->buf;
unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
- uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
int r, c;
+ MODE_INFO **tmp_mi = mib;
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
@@ -1207,7 +1224,7 @@
// Determine the vertical edges that need filtering
for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO *mi = mib[c];
+ const MODE_INFO *mi = tmp_mi[c];
const MB_MODE_INFO *mbmi = &mi[0].mbmi;
const BLOCK_SIZE sb_type = mbmi->sb_type;
const int skip_this = mbmi->skip && is_inter_block(mbmi);
@@ -1224,24 +1241,39 @@
const int skip_this_r = skip_this && !block_edge_above;
#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+ ? mbmi->tx_size
+ : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+ const TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
+
TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
? get_uv_tx_size(mbmi, plane)
: mbmi->tx_size;
-#else
- const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
- ? get_uv_tx_size(mbmi, plane)
- : mbmi->tx_size;
-#endif
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
- TX_SIZE tx_size_c = num_4x4_blocks_wide_txsize_log2_lookup[tx_size];
- TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size];
+ TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+ TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
int tx_size_mask = 0;
+ const int c_step = (c >> ss_x);
+ const int r_step = (r >> ss_y);
+ const int col_mask = 1 << c_step;
+
+#if CONFIG_VAR_TX
+ if (is_inter_block(mbmi) && !mbmi->skip)
+ tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+ : mb_tx_size;
+#endif
+
// Filter level can vary per MI
- if (!(lfl[r][c >> ss_x] = get_filter_level(&cm->lf_info, mbmi))) continue;
+ if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
if (txsize_sqr_up_map[tx_size] == TX_32X32)
tx_size_mask = 3;
@@ -1251,19 +1283,6 @@
tx_size_mask = 0;
#if CONFIG_VAR_TX
- if (is_inter_block(mbmi) && !mbmi->skip) {
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
- TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
- ? mbmi->tx_size
- : mbmi->inter_tx_size[blk_row][blk_col];
-#else
- TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
-#endif
- tx_size = (plane->plane_type == PLANE_TYPE_UV)
- ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
- : mb_tx_size;
- }
-
#if CONFIG_EXT_TX && CONFIG_RECT_TX
tx_size_r =
AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
@@ -1286,60 +1305,60 @@
// Build masks based on the transform size of each block
// handle vertical mask
if (tx_size_c == TX_32X32) {
- if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
if (!skip_border_4x4_c)
- mask_16x16_c |= 1 << (c >> ss_x);
+ mask_16x16_c |= col_mask;
else
- mask_8x8_c |= 1 << (c >> ss_x);
+ mask_8x8_c |= col_mask;
}
} else if (tx_size_c == TX_16X16) {
- if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
if (!skip_border_4x4_c)
- mask_16x16_c |= 1 << (c >> ss_x);
+ mask_16x16_c |= col_mask;
else
- mask_8x8_c |= 1 << (c >> ss_x);
+ mask_8x8_c |= col_mask;
}
} else {
// force 8x8 filtering on 32x32 boundaries
- if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
- mask_8x8_c |= 1 << (c >> ss_x);
+ mask_8x8_c |= col_mask;
else
- mask_4x4_c |= 1 << (c >> ss_x);
+ mask_4x4_c |= col_mask;
}
if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
- ((c >> ss_x) & tx_size_mask) == 0)
- mask_4x4_int[r] |= 1 << (c >> ss_x);
+ (c_step & tx_size_mask) == 0)
+ mask_4x4_int[r] |= col_mask;
}
// set horizontal mask
if (tx_size_r == TX_32X32) {
- if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
- mask_16x16[r] |= 1 << (c >> ss_x);
+ mask_16x16[r] |= col_mask;
else
- mask_8x8[r] |= 1 << (c >> ss_x);
+ mask_8x8[r] |= col_mask;
}
} else if (tx_size_r == TX_16X16) {
- if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
- mask_16x16[r] |= 1 << (c >> ss_x);
+ mask_16x16[r] |= col_mask;
else
- mask_8x8[r] |= 1 << (c >> ss_x);
+ mask_8x8[r] |= col_mask;
}
} else {
// force 8x8 filtering on 32x32 boundaries
- if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
- if (tx_size_r == TX_8X8 || ((r >> ss_y) & 3) == 0)
- mask_8x8[r] |= 1 << (c >> ss_x);
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
+ mask_8x8[r] |= col_mask;
else
- mask_4x4[r] |= 1 << (c >> ss_x);
+ mask_4x4[r] |= col_mask;
}
if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
((r >> ss_y) & tx_size_mask) == 0)
- mask_4x4_int[r] |= 1 << (c >> ss_x);
+ mask_4x4_int[r] |= col_mask;
}
}
@@ -1364,11 +1383,176 @@
mask_4x4_int[r], &cm->lf_info, &lfl[r][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
dst->buf += MI_SIZE * dst->stride;
- mib += row_step * cm->mi_stride;
+ tmp_mi += row_step_stride;
}
// Now do horizontal pass
dst->buf = dst0;
+}
+
+void av1_filter_block_plane_non420_hor(AV1_COMMON *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mib, int mi_row,
+ int mi_col) {
+ const int ss_x = plane->subsampling_x;
+ const int ss_y = plane->subsampling_y;
+ const int row_step = 1 << ss_y;
+ const int col_step = 1 << ss_x;
+ const int row_step_stride = cm->mi_stride * row_step;
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
+ unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ int r, c;
+ MODE_INFO **tmp_mi = mib;
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
+ unsigned int mask_16x16_c = 0;
+ unsigned int mask_8x8_c = 0;
+ unsigned int mask_4x4_c = 0;
+
+ // Determine the vertical edges that need filtering
+ for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
+ const MODE_INFO *mi = tmp_mi[c];
+ const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+ const BLOCK_SIZE sb_type = mbmi->sb_type;
+ const int skip_this = mbmi->skip && is_inter_block(mbmi);
+ const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+ const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
+ // left edge of current unit is block/partition edge -> no skip
+ const int block_edge_left =
+ (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
+ const int skip_this_c = skip_this && !block_edge_left;
+ // top edge of current unit is block/partition edge -> no skip
+ const int block_edge_above =
+ (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
+ const int skip_this_r = skip_this && !block_edge_above;
+
+ TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? get_uv_tx_size(mbmi, plane)
+ : mbmi->tx_size;
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+ ? mbmi->tx_size
+ : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+ TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
+ const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+ const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+
+ TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+ TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
+
+ int tx_size_mask = 0;
+ const int c_step = (c >> ss_x);
+ const int r_step = (r >> ss_y);
+ const int col_mask = 1 << c_step;
+
+#if CONFIG_VAR_TX
+ if (is_inter_block(mbmi) && !mbmi->skip) {
+ tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+ : mb_tx_size;
+ }
+#endif
+
+ // Filter level can vary per MI
+ if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
+
+ if (txsize_sqr_up_map[tx_size] == TX_32X32)
+ tx_size_mask = 3;
+ else if (txsize_sqr_up_map[tx_size] == TX_16X16)
+ tx_size_mask = 1;
+ else
+ tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ tx_size_r =
+ AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
+ tx_size_c = AOMMIN(txsize_vert_map[tx_size],
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = txsize_horz_map[tx_size];
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] =
+ txsize_vert_map[tx_size];
+#else
+ tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+ tx_size_c =
+ AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = tx_size;
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif
+#endif
+
+ // Build masks based on the transform size of each block
+ // handle vertical mask
+ if (tx_size_c == TX_32X32) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ mask_16x16_c |= col_mask;
+ else
+ mask_8x8_c |= col_mask;
+ }
+ } else if (tx_size_c == TX_16X16) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ mask_16x16_c |= col_mask;
+ else
+ mask_8x8_c |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+ mask_8x8_c |= col_mask;
+ else
+ mask_4x4_c |= col_mask;
+ }
+
+ if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+ (c_step & tx_size_mask) == 0)
+ mask_4x4_int[r] |= col_mask;
+ }
+
+ // set horizontal mask
+ if (tx_size_r == TX_32X32) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_r)
+ mask_16x16[r] |= col_mask;
+ else
+ mask_8x8[r] |= col_mask;
+ }
+ } else if (tx_size_r == TX_16X16) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_r)
+ mask_16x16[r] |= col_mask;
+ else
+ mask_8x8[r] |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
+ mask_8x8[r] |= col_mask;
+ else
+ mask_4x4[r] |= col_mask;
+ }
+
+ if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+ ((r >> ss_y) & tx_size_mask) == 0)
+ mask_4x4_int[r] |= col_mask;
+ }
+ }
+ tmp_mi += row_step_stride;
+ }
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -1404,11 +1588,12 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
dst->buf += MI_SIZE * dst->stride;
}
+ dst->buf = dst0;
}
-void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
uint8_t *const dst0 = dst->buf;
int r;
@@ -1452,10 +1637,20 @@
// Horizontal pass
dst->buf = dst0;
- mask_16x16 = lfm->above_y[TX_16X16];
- mask_8x8 = lfm->above_y[TX_8X8];
- mask_4x4 = lfm->above_y[TX_4X4];
- mask_4x4_int = lfm->int_4x4_y;
+}
+
+void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->above_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
unsigned int mask_16x16_r;
@@ -1495,11 +1690,13 @@
mask_4x4 >>= MI_SIZE;
mask_4x4_int >>= MI_SIZE;
}
+ // restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
}
-void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
uint8_t *const dst0 = dst->buf;
int r, c;
@@ -1511,6 +1708,7 @@
assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
assert(plane->plane_type == PLANE_TYPE_UV);
+ memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
// Vertical pass: do 2 rows at one time
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
@@ -1554,10 +1752,30 @@
// Horizontal pass
dst->buf = dst0;
- mask_16x16 = lfm->above_uv[TX_16X16];
- mask_8x8 = lfm->above_uv[TX_8X8];
- mask_4x4 = lfm->above_uv[TX_4X4];
- mask_4x4_int = lfm->above_int_4x4_uv;
+}
+
+void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r, c;
+ uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
+ uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
+ uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
+ uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+ memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
+
+ // re-populate the filter level for uv, same as the code for the vertical
+ // filter in av1_filter_block_plane_ss11_ver
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+ for (c = 0; c < (cm->mib_size >> 1); c++) {
+ lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+ lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
+ }
+ }
for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
@@ -1600,6 +1818,8 @@
mask_4x4 >>= MI_SIZE / 2;
mask_4x4_int >>= MI_SIZE / 2;
}
+ // restore the buf pointer in case there is an additional filter pass.
+ dst->buf = dst0;
}
void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
@@ -1622,12 +1842,15 @@
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
- for (plane = 0; plane < num_planes; ++plane)
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
- mi_col);
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ }
}
}
-#else
+#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
enum lf_path path;
@@ -1641,7 +1864,7 @@
path = LF_PATH_444;
else
path = LF_PATH_SLOW;
-
+#if CONFIG_PARALLEL_DEBLOCKING
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
@@ -1652,23 +1875,85 @@
// TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
- av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
for (plane = 1; plane < num_planes; ++plane) {
switch (path) {
case LF_PATH_420:
- av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_444:
- av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_SLOW:
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
break;
}
}
}
}
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ int plane;
+
+ av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
+ av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
+ }
+ }
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ int plane;
+
+ av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
+ av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+
+ break;
+ }
+ }
+ }
+ }
+#endif // CONFIG_PARALLEL_DEBLOCKING
#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
}
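
Splitting the filter into *_ver and *_hor entry points means a caller runs the full vertical pass before the horizontal one, and each half restores dst->buf so the next pass starts from the same origin; under CONFIG_PARALLEL_DEBLOCKING every vertical edge in the frame is filtered before any horizontal edge. The per-plane calling pattern, shown here only as an illustrative sketch of the slow (non-420) path:

  for (plane = 0; plane < num_planes; ++plane) {
    av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
                                      mi_row, mi_col);
    av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
                                      mi_row, mi_col);
  }
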
diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h
index 975cbdf..0f70672 100644
--- a/av1/common/loopfilter.h
+++ b/av1/common/loopfilter.h
@@ -99,17 +99,27 @@
const int mi_col, MODE_INFO **mi_8x8,
const int mode_info_stride, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss00(struct AV1Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row, LOOP_FILTER_MASK *lfm);
-void av1_filter_block_plane_ss11(struct AV1Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_non420(struct AV1Common *cm,
- struct macroblockd_plane *plane,
- MODE_INFO **mi_8x8, int mi_row, int mi_col);
+void av1_filter_block_plane_non420_ver(struct AV1Common *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col);
+void av1_filter_block_plane_non420_hor(struct AV1Common *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col);
void av1_loop_filter_init(struct AV1Common *cm);
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 2344bc1..f9402e9 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -350,7 +350,11 @@
if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+#if CONFIG_SIMP_MV_PRED
+ ref_mv_stack[idx].pred_mv[0] = prev_frame_mvs->mv[ref];
+#else
ref_mv_stack[idx].pred_mv[0] = prev_frame_mvs->pred_mv[ref];
+#endif
ref_mv_stack[idx].weight = 2;
++(*refmv_count);
}
@@ -394,8 +398,8 @@
// Check top-right boundary
if (has_tr)
- newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 1,
- ref_mv_stack, refmv_count);
+ newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
+ xd->n8_w, ref_mv_stack, refmv_count);
nearest_refmv_count = *refmv_count;
@@ -419,13 +423,26 @@
mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
}
- // Scan the second outer area.
+// Scan the second outer area.
+#if CONFIG_SIMP_MV_PRED
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, -1, ref_mv_stack,
+ refmv_count);
+ for (idx = 2; idx <= 3; ++idx) {
+ scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+ refmv_count);
+ scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+ refmv_count);
+ }
+ scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -4, ref_mv_stack,
+ refmv_count);
+#else
for (idx = 2; idx <= 4; ++idx) {
scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
refmv_count);
scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
refmv_count);
}
+#endif
switch (nearest_refmv_count) {
case 0:
@@ -530,7 +547,9 @@
void *const data, int16_t *mode_context) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
+#if !CONFIG_SIMP_MV_PRED
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+#endif
int different_ref_found = 0;
int context_counter = 0;
const MV_REF *const prev_frame_mvs =
@@ -540,6 +559,29 @@
const TileInfo *const tile = &xd->tile;
const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
+#if CONFIG_SIMP_MV_PRED
+ POSITION mv_ref_search[MVREF_NEIGHBOURS];
+ const int num_8x8_blocks_wide = bw >> 3;
+ const int num_8x8_blocks_high = bh >> 3;
+ mv_ref_search[0].row = num_8x8_blocks_high - 1;
+ mv_ref_search[0].col = -1;
+ mv_ref_search[1].row = -1;
+ mv_ref_search[1].col = num_8x8_blocks_wide - 1;
+ mv_ref_search[2].row = -1;
+ mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1;
+ mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1;
+ mv_ref_search[3].col = -1;
+ mv_ref_search[4].row = -1;
+ mv_ref_search[4].col = -1;
+ mv_ref_search[5].row = -1;
+ mv_ref_search[5].col = num_8x8_blocks_wide;
+ mv_ref_search[6].row = num_8x8_blocks_high;
+ mv_ref_search[6].col = -1;
+ mv_ref_search[7].row = -1;
+ mv_ref_search[7].col = -3;
+ mv_ref_search[8].row = num_8x8_blocks_high - 1;
+ mv_ref_search[8].col = -3;
+#endif
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -571,6 +613,11 @@
if (is_inside(tile, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_SIMP_MV_PRED
+ if (candidate == NULL) continue;
+ if ((mi_row % 8) + mv_ref->row >= 8 || (mi_col % 8) + mv_ref->col >= 8)
+ continue;
+#endif
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
@@ -617,6 +664,11 @@
if (is_inside(tile, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_SIMP_MV_PRED
+ if (candidate == NULL) continue;
+ if ((mi_row % 8) + mv_ref->row >= 8 || (mi_col % 8) + mv_ref->col >= 8)
+ continue;
+#endif
// If the candidate is INTRA we don't want to consider its mv.
IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h
index 25ebbfd..a9478a6 100644
--- a/av1/common/mvref_common.h
+++ b/av1/common/mvref_common.h
@@ -18,7 +18,11 @@
extern "C" {
#endif
+#if CONFIG_SIMP_MV_PRED
+#define MVREF_NEIGHBOURS 9
+#else
#define MVREF_NEIGHBOURS 8
+#endif
typedef struct position {
int row;
@@ -96,6 +100,7 @@
BOTH_INTRA // 18
};
+#if !CONFIG_SIMP_MV_PRED
static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
// 4X4
{ { -1, 0 },
@@ -245,6 +250,7 @@
{ -2, 12 } },
#endif // CONFIG_EXT_PARTITION
};
+#endif
static const int idx_n_column_to_subblock[4][2] = {
{ 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 }
@@ -268,22 +274,30 @@
// on whether the block_size < 8x8 and we have check_sub_blocks set.
static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
int search_col, int block_idx) {
+#if CONFIG_SIMP_MV_PRED
+ return candidate->mbmi.mv[which_mv];
+#else
return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate
->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.as_mv[which_mv]
: candidate->mbmi.mv[which_mv];
+#endif
}
#if CONFIG_REF_MV
static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate,
int which_mv, int search_col,
int block_idx) {
+#if CONFIG_SIMP_MV_PRED
+ return candidate->mbmi.mv[which_mv];
+#else
return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
? candidate
->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
.pred_mv[which_mv]
: candidate->mbmi.pred_mv[which_mv];
+#endif
}
#endif
@@ -341,8 +355,7 @@
}
static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
- const int use_hp = allow_hp && av1_use_mv_hp(mv);
- if (!use_hp) {
+ if (!allow_hp) {
if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
}
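
With this change lower_mv_precision() drops the 1/8-pel bit whenever allow_hp is zero, no longer consulting av1_use_mv_hp(). A small worked example (illustrative only, assuming the usual MV layout of { row, col } in 1/8-pel units):

  MV mv = { 5, -3 };           // both components odd
  lower_mv_precision(&mv, 0);  // allow_hp == 0
  // mv is now { 4, -2 }: odd components are rounded toward zero.
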
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 7aa704f..f19291c 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -262,13 +262,52 @@
return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
}
+static INLINE void copy_8x8_16bit(int16_t *dst, int dstride, int16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ dst[i * dstride + j] = src[i * sstride + j];
+}
+
+static INLINE void copy_4x4_16bit(int16_t *dst, int dstride, int16_t *src,
+ int sstride) {
+ int i, j;
+ for (i = 0; i < 4; i++)
+ for (j = 0; j < 4; j++)
+ dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* TODO: Optimize this function for SSE. */
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+ unsigned char (*bskip)[2], int dering_count, int bsize) {
+ int bi, bx, by;
+ if (bsize == 3) {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
+ dstride,
+ &src[(by << 3) * sstride + (bx << 3)], sstride);
+ }
+ } else {
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
+ dstride,
+ &src[(by << 2) * sstride + (bx << 2)], sstride);
+ }
+ }
+}
+
void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
- unsigned char *bskip, int skip_stride, int threshold,
+ unsigned char (*bskip)[2], int dering_count, int threshold,
int coeff_shift) {
int i;
int j;
+ int bi;
int bx;
int by;
int16_t inbuf[OD_DERING_INBUF_SIZE];
@@ -295,56 +334,44 @@
in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
}
}
- /* Assume deringing filter is sparsely applied, so do one large copy rather
- than small copies later if deringing is skipped. */
- for (i = 0; i < nvb << bsize; i++) {
- for (j = 0; j < nhb << bsize; j++) {
- y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
- }
- }
if (pli == 0) {
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- if (bskip[by * skip_stride + bx]) continue;
- dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
- &var[by][bx], coeff_shift);
- /* Deringing orthogonal to the direction uses a tighter threshold
- because we want to be conservative. We've presumably already
- achieved some deringing, so the amount of change is expected
- to be low. Also, since we might be filtering across an edge, we
- want to make sure not to blur it. That being said, we might want
- to be a little bit more aggressive on pure horizontal/vertical
- since the ringing there tends to be directional, so it doesn't
- get removed by the directional filtering. */
- filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
- &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
- od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
- }
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
+ &var[by][bx], coeff_shift);
+ /* Deringing orthogonal to the direction uses a tighter threshold
+ because we want to be conservative. We've presumably already
+ achieved some deringing, so the amount of change is expected
+ to be low. Also, since we might be filtering across an edge, we
+ want to make sure not to blur it. That being said, we might want
+ to be a little bit more aggressive on pure horizontal/vertical
+ since the ringing there tends to be directional, so it doesn't
+ get removed by the directional filtering. */
+ filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+ &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+ od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
}
} else {
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- if (bskip[by * skip_stride + bx]) continue;
- filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
- &y[(by * ystride << bsize) + (bx << bsize)], ystride,
- &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
- dir[by][bx]);
- }
- }
- }
- for (i = 0; i < nvb << bsize; i++) {
- for (j = 0; j < nhb << bsize; j++) {
- in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
- }
- }
- for (by = 0; by < nvb; by++) {
- for (bx = 0; bx < nhb; bx++) {
- if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
- (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[(by * ystride << bsize) + (bx << bsize)], ystride,
- &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
dir[by][bx]);
}
}
+ copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+ bsize);
+ for (bi = 0; bi < dering_count; bi++) {
+ by = bskip[bi][0];
+ bx = bskip[bi][1];
+ if (filter2_thresh[by][bx] == 0) continue;
+ (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+ &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+ &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
+ dir[by][bx]);
+ }
}
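
od_dering() and copy_blocks_16bit() now take an explicit list of (row, col) block coordinates plus a count instead of a per-block skip mask with a stride. A sketch of how a caller might build that list; is_block_skipped() is a hypothetical stand-in for the real skip flags:

  unsigned char bskip[OD_DERING_NBLOCKS * OD_DERING_NBLOCKS][2];
  int dering_count = 0;
  int by, bx;
  for (by = 0; by < nvb; by++) {
    for (bx = 0; bx < nhb; bx++) {
      if (is_block_skipped(by, bx)) continue;      // hypothetical predicate
      bskip[dering_count][0] = (unsigned char)by;  // row index of the block
      bskip[dering_count][1] = (unsigned char)bx;  // column index of the block
      dering_count++;
    }
  }
  // bskip[] and dering_count are then passed to od_dering()/copy_blocks_16bit().
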
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index c64439f..97090e5 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -12,6 +12,8 @@
#if !defined(_dering_H)
#define _dering_H (1)
+// clang-format off
+
#include "odintrin.h"
#if defined(DAALA_ODINTRIN)
@@ -34,10 +36,13 @@
typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
const int16_t *in,
int threshold, int dir);
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+ unsigned char (*bskip)[2], int dering_count, int bsize);
+
void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
- unsigned char *bskip, int skip_stride, int threshold,
+ unsigned char (*bskip)[2], int dering_count, int threshold,
int coeff_shift);
int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
int threshold, int dir);
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index be1cbc1..464314e 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -391,11 +391,15 @@
#if CONFIG_DERING
int dering_level;
#endif
+
#if CONFIG_DELTA_Q
int delta_q_present_flag;
// Resolution of delta quant
int delta_q_res;
#endif
+#if CONFIG_TILE_GROUPS
+ int num_tg;
+#endif
} AV1_COMMON;
// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -588,11 +592,10 @@
}
#if CONFIG_DAALA_EC
-static INLINE const aom_cdf_prob *get_y_mode_cdf(const AV1_COMMON *cm,
- const MODE_INFO *mi,
- const MODE_INFO *above_mi,
- const MODE_INFO *left_mi,
- int block) {
+static INLINE aom_cdf_prob *get_y_mode_cdf(AV1_COMMON *cm, const MODE_INFO *mi,
+ const MODE_INFO *above_mi,
+ const MODE_INFO *left_mi,
+ int block) {
const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
return cm->kf_y_cdf[above][left];
@@ -675,6 +678,30 @@
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
+static INLINE int max_block_wide(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+ const int plane) {
+ int max_blocks_wide = block_size_wide[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ // Scale the width in the transform block unit.
+ return max_blocks_wide >> tx_size_wide_log2[0];
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+ const int plane) {
+ int max_blocks_high = block_size_high[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+
+ // Scale the height in the transform block unit.
+ return max_blocks_high >> tx_size_wide_log2[0];
+}
+
static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
int mi_col_start, int mi_col_end) {
const int width = mi_col_end - mi_col_start;
@@ -704,36 +731,60 @@
}
#if CONFIG_VAR_TX
-static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, TX_SIZE tx_size,
- int len) {
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
int i;
- for (i = 0; i < len; ++i) txfm_ctx[i] = tx_size;
+ for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
}
static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h,
const MACROBLOCKD *xd) {
- set_txfm_ctx(xd->above_txfm_context, txsize_horz_map[tx_size], n8_w);
- set_txfm_ctx(xd->left_txfm_context, txsize_vert_map[tx_size], n8_h);
+ uint8_t bw = tx_size_wide[tx_size];
+ uint8_t bh = tx_size_high[tx_size];
+ set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
}
static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
TXFM_CONTEXT *left_ctx,
TX_SIZE tx_size) {
BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
- int bs = num_8x8_blocks_high_lookup[bsize];
+ int bh = num_8x8_blocks_high_lookup[bsize];
+ int bw = num_8x8_blocks_wide_lookup[bsize];
+ uint8_t txw = tx_size_wide[tx_size];
+ uint8_t txh = tx_size_high[tx_size];
int i;
- for (i = 0; i < bs; ++i) {
- above_ctx[i] = tx_size;
- left_ctx[i] = tx_size;
- }
+ for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+ for (i = 0; i < bw; ++i) above_ctx[i] = txw;
}
static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
TXFM_CONTEXT *left_ctx,
- TX_SIZE tx_size) {
- int above = *above_ctx < tx_size;
- int left = *left_ctx < tx_size;
- return (tx_size - 1) * 3 + above + left;
+ const BLOCK_SIZE bsize,
+ const TX_SIZE tx_size) {
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ const int above = *above_ctx < txw;
+ const int left = *left_ctx < txh;
+ TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ int category = 15;
+
+ if (max_tx_size == TX_32X32) {
+ if (tx_size == TX_32X32)
+ category = 0;
+ else
+ category = 1;
+ } else if (max_tx_size == TX_16X16) {
+ if (tx_size == TX_16X16)
+ category = 2;
+ else
+ category = 3;
+ } else if (max_tx_size == TX_8X8) {
+ category = 4;
+ }
+
+ if (category == 15) return category;
+
+ return category * 3 + above + left;
}
#endif
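
The reworked txfm_partition_context() first maps the block's maximum transform size and the coded transform size to a category, then adds the above/left comparisons. A worked example (illustrative; assumes the usual table values tx_size_wide[TX_16X16] == tx_size_high[TX_16X16] == 16 and max_txsize_lookup[BLOCK_32X32] == TX_32X32):

  // bsize   = BLOCK_32X32 -> max_tx_size = TX_32X32
  // tx_size = TX_16X16    -> category = 1 (32x32 block, split transform)
  // *above_ctx = 32, *left_ctx = 8  (recorded neighbour width/height)
  // above = (32 < 16) = 0,  left = (8 < 16) = 1
  // context = category * 3 + above + left = 1 * 3 + 0 + 1 = 4
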
diff --git a/av1/common/pred_common.c b/av1/common/pred_common.c
index 35067f2..5b7c2ec 100644
--- a/av1/common/pred_common.c
+++ b/av1/common/pred_common.c
@@ -756,36 +756,24 @@
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi))
+ if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
- else
- pred_context = 1 + (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) ||
- !CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[1]));
+ else // comp
+ pred_context = 2;
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
- const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
- const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
- pred_context =
- 1 + (!CHECK_BACKWARD_REFS(above0) || !CHECK_BACKWARD_REFS(above1) ||
- !CHECK_BACKWARD_REFS(left0) || !CHECK_BACKWARD_REFS(left1));
- } else if (above_has_second || left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
- const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
- const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
- if (!CHECK_BACKWARD_REFS(rfs))
- pred_context =
- 3 + (!CHECK_BACKWARD_REFS(crf1) || !CHECK_BACKWARD_REFS(crf2));
- else
- pred_context =
- !CHECK_BACKWARD_REFS(crf1) || !CHECK_BACKWARD_REFS(crf2);
- } else {
+ pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1;
+ } else { // single/single
pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) +
2 * (!CHECK_BACKWARD_REFS(left0));
}
@@ -794,12 +782,11 @@
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) { // intra
pred_context = 2;
- } else { // inter
- if (!has_second_ref(edge_mbmi))
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
- else
- pred_context = 1 + (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) ||
- !CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[1]));
+ else // comp
+ pred_context = 2;
}
} else { // no edges available
pred_context = 2;
@@ -833,12 +820,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
edge_mbmi->ref_frame[1] == BWDREF_FRAME);
@@ -851,14 +838,14 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context =
3 * (above0 == BWDREF_FRAME || above1 == BWDREF_FRAME ||
left0 == BWDREF_FRAME || left1 == BWDREF_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -869,7 +856,7 @@
pred_context = (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
else
pred_context = 1 + 2 * (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
- } else {
+ } else { // single/single
if (!CHECK_BACKWARD_REFS(above0) && !CHECK_BACKWARD_REFS(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_BACKWARD_REFS(above0) ||
@@ -890,9 +877,9 @@
(!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
edge_mbmi->ref_frame[1] == BWDREF_FRAME);
} else { // no edges available (2)
@@ -927,12 +914,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
@@ -945,14 +932,14 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context =
3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) ||
CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1));
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -966,7 +953,7 @@
else
pred_context =
1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
- } else {
+ } else { // single/single
if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) {
pred_context = 2 + (above0 == left0);
} else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) {
@@ -986,9 +973,9 @@
(CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
- else
+ else // comp
pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
} else { // no edges available (2)
@@ -1023,12 +1010,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
@@ -1041,13 +1028,13 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
left0 == LAST_FRAME || left1 == LAST_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -1058,7 +1045,7 @@
pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
- } else {
+ } else { // single/single
if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_LAST_OR_LAST2(above0) ||
@@ -1078,9 +1065,9 @@
(!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
} else { // no edges available (2)
@@ -1115,12 +1102,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
@@ -1133,13 +1120,13 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
left0 == LAST3_FRAME || left1 == LAST3_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -1150,7 +1137,7 @@
pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
- } else {
+ } else { // single/single
if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
@@ -1171,9 +1158,9 @@
(!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
} else { // no edges available (2)
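
The hunks above collapse every compound-reference neighbour case to the mid context value 2 and keep the arithmetic only for single-reference neighbours. A small self-contained sketch of the retained single/single leg, with a toy check_backward() standing in for the CHECK_BACKWARD_REFS() macro (an assumption; the real macro's definition is not shown here):

/* Illustration only: the single/single branch kept by the patch above. */
#include <stdio.h>

enum { FWD_REF = 0, BWD_REF = 1 }; /* toy reference classes */

static int check_backward(int ref) { return ref == BWD_REF; }

static int single_single_context(int above_ref, int left_ref) {
  return 2 * !check_backward(above_ref) + 2 * !check_backward(left_ref);
}

int main(void) {
  printf("%d %d %d\n",
         single_single_context(BWD_REF, BWD_REF),  /* 0 */
         single_single_context(FWD_REF, BWD_REF),  /* 2 */
         single_single_context(FWD_REF, FWD_REF)); /* 4 */
  return 0;
}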
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index b3ef1c4..6b47ed2 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -186,8 +186,13 @@
if (!has_left) left_ctx = above_ctx;
if (!has_above) above_ctx = left_ctx;
-
+#if CONFIG_CB4X4
+ // TODO(jingning): Temporary setup. Will rework this after the cb4x4
+ // framework is up and running.
+ return (above_ctx + left_ctx) > max_tx_size + 1;
+#else
return (above_ctx + left_ctx) > max_tx_size;
+#endif
}
#if CONFIG_VAR_TX
@@ -200,13 +205,8 @@
const int tx_row = blk_row >> (1 - pd->subsampling_y);
const int tx_col = blk_col >> (1 - pd->subsampling_x);
const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
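
The replaced lines show the edge clamp that the new max_block_high()/max_block_wide() helpers are assumed to wrap: when the block overhangs the frame, the (negative) mb_to_* edge offset shrinks the number of 4x4 units that still need processing. A toy version of that arithmetic, using hypothetical values:

/* Illustration only: the clamp the removed inline code performed; arithmetic
 * right shift of the negative offset is assumed, as in the original code. */
#include <stdio.h>

static int clamp_blocks_high(int blocks_high, int mb_to_bottom_edge,
                             int subsampling_y) {
  if (mb_to_bottom_edge < 0) /* block overhangs the bottom of the frame */
    blocks_high += mb_to_bottom_edge >> (5 + subsampling_y);
  return blocks_high;
}

int main(void) {
  /* Nominally 16 rows of 4x4 units, with a hypothetical edge offset of -256. */
  printf("%d\n", clamp_blocks_high(16, -256, 0)); /* 16 + (-256 >> 5) = 8 */
  return 0;
}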
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index b3228b7..69d0cc0 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -16,11 +16,6 @@
#include "av1/common/seg_common.h"
#include "av1/common/blockd.h"
-#if CONFIG_AOM_QM
-static void make_qmatrices(qm_val_t *wmatrix[NUM_QM_LEVELS][2][2][TX_SIZES],
- qm_val_t *iwmatrix[NUM_QM_LEVELS][2][2][TX_SIZES]);
-#endif
-
#if CONFIG_NEW_QUANT
// Bin widths expressed as a fraction over 128 of the quant stepsize,
// for the quantization bins 0-4.
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 6c4ae2a..66b6bfd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -1889,6 +1889,7 @@
int dst_stride,
PREDICTION_MODE mode,
BLOCK_SIZE bsize, int plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
const int bwl = b_width_log2_lookup[plane_bsize];
const int bhl = b_height_log2_lookup[plane_bsize];
@@ -1897,14 +1898,14 @@
TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
if (bwl == bhl) {
- av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
- dst, dst_stride, 0, 0, plane);
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
} else if (bwl < bhl) {
uint8_t *src_2 = ref + pxbw * ref_stride;
uint8_t *dst_2 = dst + pxbw * dst_stride;
- av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
- dst, dst_stride, 0, 0, plane);
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1916,14 +1917,14 @@
{
memcpy(src_2 - ref_stride, dst_2 - dst_stride, sizeof(*src_2) * pxbw);
}
- av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
- dst_2, dst_stride, 0, 1 << bwl, plane);
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, 0, 1 << bwl, plane);
} else { // bwl > bhl
int i;
uint8_t *src_2 = ref + pxbh;
uint8_t *dst_2 = dst + pxbh;
- av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
- dst, dst_stride, 0, 0, plane);
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1936,8 +1937,8 @@
for (i = 0; i < pxbh; ++i)
src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
}
- av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
- dst_2, dst_stride, 1 << bhl, 0, plane);
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, 1 << bhl, 0, plane);
}
}
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 3eec384..13f581e 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -14,7 +14,7 @@
#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
@@ -418,7 +418,6 @@
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
-#if CONFIG_DUAL_FILTER
// Detect if the block has sub-pixel level motion vectors
// per component.
static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
@@ -460,60 +459,28 @@
return 0;
}
-#endif
-#if CONFIG_EXT_INTERP
+#define CHECK_SUBPEL 0
static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+#if CHECK_SUBPEL
MODE_INFO *const mi = xd->mi[0];
- MB_MODE_INFO *const mbmi = &mi->mbmi;
- const BLOCK_SIZE bsize = mbmi->sb_type;
- const int is_compound = has_second_ref(mbmi);
- int intpel_mv = 1;
- int plane;
-
-#if SUPPORT_NONINTERPOLATING_FILTERS
- // TODO(debargha): This is is currently only for experimentation
- // with non-interpolating filters. Remove later.
- // If any of the filters are non-interpolating, then indicate the
- // interpolation filter always.
- int i;
- for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
- if (!IsInterpolatingFilter(i)) return 1;
- }
-#endif
-
- // For scaled references, interpolation filter is indicated all the time.
- if (av1_is_scaled(&xd->block_refs[0]->sf)) return 1;
- if (is_compound && av1_is_scaled(&xd->block_refs[1]->sf)) return 1;
-
- if (bsize < BLOCK_8X8) {
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int have_vsplit = bp != PARTITION_HORZ;
- const int have_hsplit = bp != PARTITION_VERT;
- const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
- const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
- int ref;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- int x, y;
- for (y = 0; y < num_4x4_h; ++y)
- for (x = 0; x < num_4x4_w; ++x) {
- const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
- if (mv_has_subpel(&mv)) return 1;
- }
+ const int is_compound = has_second_ref(&mi->mbmi);
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ int row_col;
+ for (row_col = 0; row_col < 2; ++row_col) {
+ const int dir = (ref << 1) + row_col;
+ if (has_subpel_mv_component(mi, xd, dir)) {
+ return 1;
}
}
- return 0;
- } else {
- intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
- if (is_compound && intpel_mv) {
- intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
- }
}
- return !intpel_mv;
+ return 0;
+#else
+ (void)xd;
+ return 1;
+#endif
}
-#endif // CONFIG_EXT_INTERP
#if CONFIG_MOTION_VAR
const uint8_t *av1_get_obmc_mask(int length);
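
With CHECK_SUBPEL defined to 0, the rewritten av1_is_interp_needed() above compiles the sub-pel walk out and simply reports that an interpolation filter must always be signalled; when the walk is enabled, each reference's two motion components are addressed through a single packed index. A tiny sketch of that packing (illustration only; the meaning of the low bit is not asserted here):

/* dir = (ref << 1) + row_col, as in the CHECK_SUBPEL branch above. */
#include <stdio.h>

int main(void) {
  int ref, row_col;
  for (ref = 0; ref < 2; ++ref)
    for (row_col = 0; row_col < 2; ++row_col) {
      const int dir = (ref << 1) + row_col;
      printf("ref=%d row_col=%d -> dir=%d (unpacks to %d,%d)\n", ref, row_col,
             dir, dir >> 1, dir & 1);
    }
  return 0;
}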
diff --git a/av1/common/scan.c b/av1/common/scan.c
index 1281843..b2386b9 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -36,12 +36,12 @@
0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
0, 1, 4, 5, 2, 8, 6, 9, 10, 3, 12, 7, 13, 11, 14, 16,
17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -51,12 +51,14 @@
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
+#endif
DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
0, 1, 8, 9, 2, 16, 10, 17, 18, 3, 24, 11, 25, 19, 26, 4,
12, 27, 20, 5, 28, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -66,7 +68,7 @@
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
-#endif // CONFIG_EXT_TX
+#endif
DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26,
@@ -105,7 +107,6 @@
58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
@@ -129,6 +130,7 @@
122, 63, 78, 93, 108, 123, 79, 94, 109, 124, 95, 110, 125, 111, 126, 127,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
@@ -174,6 +176,7 @@
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127,
};
+#endif
DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
@@ -251,6 +254,7 @@
510, 511,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
@@ -1034,7 +1038,6 @@
8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
@@ -1043,6 +1046,7 @@
24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
@@ -1058,6 +1062,7 @@
13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
};
+#endif
DECLARE_ALIGNED(16, static const int16_t,
default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
@@ -1067,6 +1072,7 @@
13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
@@ -1141,7 +1147,6 @@
31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
@@ -1186,6 +1191,7 @@
126, 0, 0
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
@@ -1271,6 +1277,7 @@
104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
126, 0, 0
};
+#endif
DECLARE_ALIGNED(16, static const int16_t,
default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
@@ -1418,6 +1425,7 @@
478, 509, 479, 510, 0, 0
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t,
mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
@@ -2841,12 +2849,12 @@
0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
0, 1, 4, 9, 2, 3, 6, 11, 5, 7, 8, 13, 10, 12, 14, 17,
15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -2856,12 +2864,14 @@
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
};
+#endif
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
0, 1, 4, 9, 15, 19, 24, 28, 2, 3, 6, 11, 16, 21, 25, 29,
5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -2910,7 +2920,6 @@
25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
};
-#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
@@ -2933,6 +2942,7 @@
35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
@@ -2978,6 +2988,7 @@
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127,
};
+#endif
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
@@ -3055,6 +3066,7 @@
510, 511,
};
+#if CONFIG_EXT_TX
DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
@@ -3801,20 +3813,24 @@
#endif // CONFIG_EXT_TX
const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
+#if CONFIG_CB4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#endif
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
{ default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
};
-#if CONFIG_EXT_TX
const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
+#if CONFIG_CB4X4
{
- // TX_4X4
+ // TX_2X2
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
{ col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3827,6 +3843,29 @@
{ col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
{ row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
{ col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_8X8
@@ -3834,6 +3873,7 @@
{ row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
{ col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3846,6 +3886,7 @@
{ col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
{ row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
{ col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_16X16
@@ -3855,6 +3896,7 @@
{ col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
{ default_scan_16x16, av1_default_iscan_16x16,
default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_16x16, av1_default_iscan_16x16,
default_scan_16x16_neighbors },
{ default_scan_16x16, av1_default_iscan_16x16,
@@ -3872,11 +3914,13 @@
{ col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
{ row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
{ col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_32X32
{ default_scan_32x32, av1_default_iscan_32x32,
default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
{ h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
{ v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
{ qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -3892,16 +3936,41 @@
{ mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
{ mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
{ mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
}
};
const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+#if CONFIG_CB4X4
+ {
+ // TX_2X2
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif
{
// TX_4X4
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3914,6 +3983,7 @@
{ mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
{ mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
{ mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_8X8
@@ -3921,6 +3991,7 @@
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
{ default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3933,6 +4004,7 @@
{ mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
{ mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
{ mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_16X16
@@ -3944,6 +4016,7 @@
default_scan_16x16_neighbors },
{ default_scan_16x16, av1_default_iscan_16x16,
default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_16x16, av1_default_iscan_16x16,
default_scan_16x16_neighbors },
{ default_scan_16x16, av1_default_iscan_16x16,
@@ -3961,11 +4034,13 @@
{ mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
{ mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
{ mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_32X32
{ default_scan_32x32, av1_default_iscan_32x32,
default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
{ h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
{ v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
{ qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -3981,6 +4056,7 @@
{ mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
{ mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
{ mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_4X8
@@ -3988,6 +4064,7 @@
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
{ default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
@@ -4000,6 +4077,7 @@
{ mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
{ mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
{ mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_8X4
@@ -4007,6 +4085,7 @@
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
{ default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
@@ -4019,6 +4098,7 @@
{ mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
{ mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
{ mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_8X16
@@ -4030,6 +4110,7 @@
default_scan_8x16_neighbors },
{ default_scan_8x16, av1_default_iscan_8x16,
default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_8x16, av1_default_iscan_8x16,
default_scan_8x16_neighbors },
{ default_scan_8x16, av1_default_iscan_8x16,
@@ -4047,6 +4128,7 @@
{ mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
{ mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
{ mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_16X8
@@ -4058,6 +4140,7 @@
default_scan_16x8_neighbors },
{ default_scan_16x8, av1_default_iscan_16x8,
default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_16x8, av1_default_iscan_16x8,
default_scan_16x8_neighbors },
{ default_scan_16x8, av1_default_iscan_16x8,
@@ -4075,6 +4158,7 @@
{ mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
{ mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
{ mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_16X32
@@ -4086,6 +4170,7 @@
default_scan_16x32_neighbors },
{ default_scan_16x32, av1_default_iscan_16x32,
default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_16x32, av1_default_iscan_16x32,
default_scan_16x32_neighbors },
{ default_scan_16x32, av1_default_iscan_16x32,
@@ -4103,6 +4188,7 @@
{ mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
{ mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
{ mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif // CONFIG_EXT_TX
},
{
// TX_32X16
@@ -4114,6 +4200,7 @@
default_scan_32x16_neighbors },
{ default_scan_32x16, av1_default_iscan_32x16,
default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
{ default_scan_32x16, av1_default_iscan_32x16,
default_scan_32x16_neighbors },
{ default_scan_32x16, av1_default_iscan_32x16,
@@ -4131,42 +4218,9 @@
{ mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
{ mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
{ mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
- }
-};
-
-#else // CONFIG_EXT_TX
-
-const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
- { // TX_4X4
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
- { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
- { // TX_8X8
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
- { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors } },
- { // TX_16X16
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
- { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors } },
- {
- // TX_32X32
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- }
-};
#endif // CONFIG_EXT_TX
+ }
+};
#if CONFIG_ADAPT_SCAN
// TX_32X32 will have 1024 coefficients whose indexes can be represented in 10
@@ -4375,8 +4429,7 @@
int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
- const int tx2d_size = tx_size_2d[tx_size];
- assert(tx2d_size <= 1024);
+ assert(tx_size_2d[tx_size] <= 1024);
av1_update_sort_order(tx_size, non_zero_prob, sort_order);
av1_update_scan_order(tx_size, sort_order, scan, iscan);
av1_update_neighbors(tx_size, scan, iscan, nb);
diff --git a/av1/common/scan.h b/av1/common/scan.h
index af39993..2078e99 100644
--- a/av1/common/scan.h
+++ b/av1/common/scan.h
@@ -27,6 +27,7 @@
extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES];
+extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
#if CONFIG_ADAPT_SCAN
void av1_update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
@@ -87,7 +88,7 @@
return &cm->fc->sc[tx_size][tx_type];
#else // CONFIG_ADAPT_SCAN
(void)cm;
-#if CONFIG_EXT_TX
+#if CONFIG_EXT_TX || CONFIG_VAR_TX
return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
: &av1_intra_scan_orders[tx_size][tx_type];
#else
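
The scan.h change above routes CONFIG_VAR_TX builds through the same av1_inter_scan_orders[]/av1_intra_scan_orders[] tables as CONFIG_EXT_TX. Each entry is a {scan, iscan, neighbors} triple; a toy sketch of how such a pair is typically consumed when walking a block's coefficients (local types and values only, not the library's):

/* Illustration only: visit raster-ordered coefficients in scan order. */
#include <stdio.h>

typedef struct {
  const int *scan;  /* scan[i]  = raster position visited at step i      */
  const int *iscan; /* iscan[r] = step at which raster position r occurs */
} ToyScanOrder;

int main(void) {
  static const int scan2x2[4] = { 0, 2, 1, 3 };  /* hypothetical order */
  static const int iscan2x2[4] = { 0, 2, 1, 3 };
  const ToyScanOrder so = { scan2x2, iscan2x2 };
  const int coeffs_raster[4] = { 7, 0, -3, 5 };  /* toy quantized coeffs */
  int i;
  for (i = 0; i < 4; ++i)
    printf("step %d: raster %d, coeff %d\n", i, so.scan[i],
           coeffs_raster[so.scan[i]]);
  return 0;
}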
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index eeaeb21..541db1d 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -85,25 +85,155 @@
#endif // CONFIG_MULTITHREAD
}
-// Implement row loopfiltering for each thread.
-static INLINE void thread_loop_filter_rows(
- const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
- struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop,
- int y_only, AV1LfSync *const lf_sync) {
- const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+static INLINE enum lf_path get_loop_filter_path(
+ int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ if (y_only)
+ return LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ return LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ return LF_PATH_444;
+ else
+ return LF_PATH_SLOW;
+}
+
+static INLINE void loop_filter_block_plane_ver(
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+ LOOP_FILTER_MASK *lfm) {
+ if (plane == 0) {
+ av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
+ } else {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row,
+ mi_col);
+ break;
+ }
+ }
+}
+
+static INLINE void loop_filter_block_plane_hor(
+ AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+ MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+ LOOP_FILTER_MASK *lfm) {
+ if (plane == 0) {
+ av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
+ } else {
+ switch (path) {
+ case LF_PATH_420:
+ av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_444:
+ av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
+ break;
+ case LF_PATH_SLOW:
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row,
+ mi_col);
+ break;
+ }
+ }
+}
+#endif
+// Row-based multi-threaded loopfilter hook
+#if CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
#if !CONFIG_EXT_PARTITION_TYPES
- enum lf_path path;
- LOOP_FILTER_MASK lfm;
- if (y_only)
- path = LF_PATH_444;
- else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
- path = LF_PATH_420;
- else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
- path = LF_PATH_444;
- else
- path = LF_PATH_SLOW;
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+ }
+ }
+ return 1;
+}
+
+static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ // TODO(wenhao.zhang@intel.com): For better parallelization, reorder
+ // the outer loop to column-based and remove the synchronizations here.
+ sync_read(lf_sync, r, c);
+
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+ for (plane = 0; plane < num_planes; ++plane)
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+ sync_write(lf_sync, r, c, sb_cols);
+ }
+ }
+ return 1;
+}
+#else // CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_row_worker(AV1LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols =
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+ int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+ enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
#endif // !CONFIG_EXT_PARTITION_TYPES
#if CONFIG_EXT_PARTITION
@@ -113,56 +243,48 @@
exit(EXIT_FAILURE);
#endif // CONFIG_EXT_PARTITION
- for (mi_row = start; mi_row < stop;
- mi_row += lf_sync->num_workers * cm->mib_size) {
- MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+ for (mi_row = lf_data->start; mi_row < lf_data->stop;
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+ MODE_INFO **const mi =
+ lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
- const int r = mi_row >> cm->mib_size_log2;
- const int c = mi_col >> cm->mib_size_log2;
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+ LOOP_FILTER_MASK lfm;
+#endif
int plane;
sync_read(lf_sync, r, c);
- av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-
+ av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+ mi_col);
#if CONFIG_EXT_PARTITION_TYPES
- for (plane = 0; plane < num_planes; ++plane)
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
- mi_col);
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ }
#else
- // TODO(JBB): Make setup_mask work for non 420.
- av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+ av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+ lf_data->cm->mi_stride, &lfm);
- av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
- for (plane = 1; plane < num_planes; ++plane) {
- switch (path) {
- case LF_PATH_420:
- av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
- break;
- case LF_PATH_444:
- av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
- break;
- case LF_PATH_SLOW:
- av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
- break;
- }
+ for (plane = 0; plane < num_planes; ++plane) {
+ loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
+ loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+ mi + mi_col, mi_row, mi_col, path, &lfm);
}
#endif // CONFIG_EXT_PARTITION_TYPES
sync_write(lf_sync, r, c, sb_cols);
}
}
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
- LFWorkerData *const lf_data) {
- thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
- lf_data->start, lf_data->stop, lf_data->y_only,
- lf_sync);
return 1;
}
+#endif // CONFIG_PARALLEL_DEBLOCKING
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -191,17 +313,79 @@
av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
}
+// Set up loopfilter thread data.
+// The decoder is capping num_workers because it has been observed that using
+// more threads on the loopfilter than there are cores will hurt performance
+// on Android. This is because the system will only schedule the tile decode
+// workers on cores equal to the number of tile columns. Then if the decoder
+// tries to use more threads for the loopfilter, it will hurt performance
+// because of contention. If the multithreading code changes in the future
+// then the number of workers used by the loopfilter should be revisited.
+
+#if CONFIG_PARALLEL_DEBLOCKING
// Initialize cur_sb_col to -1 for all SB rows.
memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
- // Set up loopfilter thread data.
- // The decoder is capping num_workers because it has been observed that using
- // more threads on the loopfilter than there are cores will hurt performance
- // on Android. This is because the system will only schedule the tile decode
- // workers on cores equal to the number of tile columns. Then if the decoder
- // tries to use more threads for the loopfilter, it will hurt performance
- // because of contention. If the multithreading code changes in the future
- // then the number of workers used by the loopfilter should be revisited.
+ // Filter all the vertical edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+ // Filter all the horizontal edges in the whole frame
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * cm->mib_size;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+#else // CONFIG_PARALLEL_DEBLOCKING
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
for (i = 0; i < num_workers; ++i) {
AVxWorker *const worker = &workers[i];
LFWorkerData *const lf_data = &lf_sync->lfdata[i];
@@ -228,6 +412,7 @@
for (i = 0; i < num_workers; ++i) {
winterface->sync(&workers[i]);
}
+#endif // CONFIG_PARALLEL_DEBLOCKING
}
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
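
Under CONFIG_PARALLEL_DEBLOCKING, the multi-threaded loopfilter above is now driven as two full passes over the frame: every worker first filters only vertical edges for its interleaved rows, all workers are synced, and the same rows are then re-walked for horizontal edges. A sequential toy sketch of that dispatch shape (workers simulated in a plain loop; none of this is the AVxWorker API):

/* Illustration only: two passes, interleaved row ownership per worker. */
#include <stdio.h>

typedef void (*row_hook)(int worker, int row);

static void filter_ver(int w, int r) { printf("worker %d: ver row %d\n", w, r); }
static void filter_hor(int w, int r) { printf("worker %d: hor row %d\n", w, r); }

static void run_pass(row_hook hook, int num_workers, int num_rows) {
  int w, r;
  for (w = 0; w < num_workers; ++w)       /* real code launches workers...   */
    for (r = w; r < num_rows; r += num_workers) hook(w, r);
  /* ...and syncs them all before the next pass starts. */
}

int main(void) {
  run_pass(filter_ver, 2, 4); /* pass 1: all vertical edges   */
  run_pass(filter_hor, 2, 4); /* pass 2: all horizontal edges */
  return 0;
}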
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index 9fed2d6..d63d260 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -18,6 +18,10 @@
struct AV1Common;
+#if CONFIG_TILE_GROUPS
+#define MAX_NUM_TG 3
+#endif
+
typedef struct TileInfo {
int mi_row_start, mi_row_end;
int mi_col_start, mi_col_end;
diff --git a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h b/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 876e579..0000000
--- a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3202 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also causes cross reference to the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void av1_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rows_c
-#else
-void av1_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rd_rows_c
-#endif // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
- // Calculate pre-multiplied strides
- const int str1 = stride;
- const int str2 = 2 * stride;
- const int str3 = 2 * stride + str1;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
- const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
- const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
- const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
- const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
- const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
- const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
- const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
- const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
- const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
- const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
- const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
- const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
- const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
- const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- int pass;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 32; column_start += 8) {
- __m128i step1[32];
- __m128i step2[32];
- __m128i step3[32];
- __m128i out[32];
- // Stage 1
- // Note: even though all the loads below are aligned, using the aligned
- // intrinsic make the code slightly slower.
- if (0 == pass) {
- const int16_t *in = &input[column_start];
- // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- const int16_t *ina = in + 0 * str1;
- const int16_t *inb = in + 31 * str1;
- __m128i *step1a = &step1[0];
- __m128i *step1b = &step1[31];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 4 * str1;
- const int16_t *inb = in + 27 * str1;
- __m128i *step1a = &step1[4];
- __m128i *step1b = &step1[27];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 8 * str1;
- const int16_t *inb = in + 23 * str1;
- __m128i *step1a = &step1[8];
- __m128i *step1b = &step1[23];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 12 * str1;
- const int16_t *inb = in + 19 * str1;
- __m128i *step1a = &step1[12];
- __m128i *step1b = &step1[19];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- } else {
- int16_t *in = &intermediate[column_start];
- // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
- // Note: using the same approach as above to have a common offset is
- // counter-productive as all offsets can be calculated at compile
- // time.
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
- __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
- __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
- __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
- __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
- __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
- __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
- __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[0] = ADD_EPI16(in00, in31);
- step1[1] = ADD_EPI16(in01, in30);
- step1[2] = ADD_EPI16(in02, in29);
- step1[3] = ADD_EPI16(in03, in28);
- step1[28] = SUB_EPI16(in03, in28);
- step1[29] = SUB_EPI16(in02, in29);
- step1[30] = SUB_EPI16(in01, in30);
- step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
- &step1[3], &step1[28], &step1[29],
- &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
- __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
- __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
- __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
- __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
- __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
- __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
- __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[4] = ADD_EPI16(in04, in27);
- step1[5] = ADD_EPI16(in05, in26);
- step1[6] = ADD_EPI16(in06, in25);
- step1[7] = ADD_EPI16(in07, in24);
- step1[24] = SUB_EPI16(in07, in24);
- step1[25] = SUB_EPI16(in06, in25);
- step1[26] = SUB_EPI16(in05, in26);
- step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
- &step1[7], &step1[24], &step1[25],
- &step1[26], &step1[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
- __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
- __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
- __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
- __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
- __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
- __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
- __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[8] = ADD_EPI16(in08, in23);
- step1[9] = ADD_EPI16(in09, in22);
- step1[10] = ADD_EPI16(in10, in21);
- step1[11] = ADD_EPI16(in11, in20);
- step1[20] = SUB_EPI16(in11, in20);
- step1[21] = SUB_EPI16(in10, in21);
- step1[22] = SUB_EPI16(in09, in22);
- step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
- &step1[11], &step1[20], &step1[21],
- &step1[22], &step1[23]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
- __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
- __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
- __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
- __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
- __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
- __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
- __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = ADD_EPI16(in12, in19);
- step1[13] = ADD_EPI16(in13, in18);
- step1[14] = ADD_EPI16(in14, in17);
- step1[15] = ADD_EPI16(in15, in16);
- step1[16] = SUB_EPI16(in15, in16);
- step1[17] = SUB_EPI16(in14, in17);
- step1[18] = SUB_EPI16(in13, in18);
- step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
- &step1[15], &step1[16], &step1[17],
- &step1[18], &step1[19]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
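- // The row pass above goes through the ADD_EPI16/SUB_EPI16 macros, which
- // (assumption) expand to the saturating _mm_adds_epi16/_mm_subs_epi16
- // when DCT_HIGH_BIT_DEPTH is set, so that check_epi16_overflow_* can
- // spot saturated lanes and fall back to the C row transform.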
- // Stage 2
- {
- step2[0] = ADD_EPI16(step1[0], step1[15]);
- step2[1] = ADD_EPI16(step1[1], step1[14]);
- step2[2] = ADD_EPI16(step1[2], step1[13]);
- step2[3] = ADD_EPI16(step1[3], step1[12]);
- step2[4] = ADD_EPI16(step1[4], step1[11]);
- step2[5] = ADD_EPI16(step1[5], step1[10]);
- step2[6] = ADD_EPI16(step1[6], step1[9]);
- step2[7] = ADD_EPI16(step1[7], step1[8]);
- step2[8] = SUB_EPI16(step1[7], step1[8]);
- step2[9] = SUB_EPI16(step1[6], step1[9]);
- step2[10] = SUB_EPI16(step1[5], step1[10]);
- step2[11] = SUB_EPI16(step1[4], step1[11]);
- step2[12] = SUB_EPI16(step1[3], step1[12]);
- step2[13] = SUB_EPI16(step1[2], step1[13]);
- step2[14] = SUB_EPI16(step1[1], step1[14]);
- step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
- const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
- const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
- const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
- const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
- const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
- const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
- const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
- const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
- const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
- const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
- const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
- const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
- const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
- const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
- const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
- const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
- const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
- const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
- const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
- const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
- const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
- const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
- const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
- // dct_const_round_shift
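- // (add the rounding constant, arithmetic-shift right by DCT_CONST_BITS,
- // then _mm_packs_epi32 the 32-bit results back into 16-bit lanes)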
- const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
- const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
- const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
- const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
- const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
- const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
- const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
- const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
- const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
- const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
- const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
- const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
- const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
- const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
- const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
- const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
- // Combine
- step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
- step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
- step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
- step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
- step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
- step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
- step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
- step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
- &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
-#if !FDCT32x32_HIGH_PRECISION
- // damp the magnitude by half so that the intermediate values stay within
- // the range of 16 bits.
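- // The rounding below computes (x + 1 + (x < 0)) >> 2 per lane: the
- // _mm_cmplt_epi16 mask is -1 in negative lanes, so subtracting it adds 1
- // only to negative values before the common +1 and the shift.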
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
- step2[0] = SUB_EPI16(step2[0], s3_00_0);
- step2[1] = SUB_EPI16(step2[1], s3_01_0);
- step2[2] = SUB_EPI16(step2[2], s3_02_0);
- step2[3] = SUB_EPI16(step2[3], s3_03_0);
- step2[4] = SUB_EPI16(step2[4], s3_04_0);
- step2[5] = SUB_EPI16(step2[5], s3_05_0);
- step2[6] = SUB_EPI16(step2[6], s3_06_0);
- step2[7] = SUB_EPI16(step2[7], s3_07_0);
- step2[8] = SUB_EPI16(step2[8], s2_08_0);
- step2[9] = SUB_EPI16(step2[9], s2_09_0);
- step2[10] = SUB_EPI16(step2[10], s3_10_0);
- step2[11] = SUB_EPI16(step2[11], s3_11_0);
- step2[12] = SUB_EPI16(step2[12], s3_12_0);
- step2[13] = SUB_EPI16(step2[13], s3_13_0);
- step2[14] = SUB_EPI16(step2[14], s2_14_0);
- step2[15] = SUB_EPI16(step2[15], s2_15_0);
- step1[16] = SUB_EPI16(step1[16], s3_16_0);
- step1[17] = SUB_EPI16(step1[17], s3_17_0);
- step1[18] = SUB_EPI16(step1[18], s3_18_0);
- step1[19] = SUB_EPI16(step1[19], s3_19_0);
- step2[20] = SUB_EPI16(step2[20], s3_20_0);
- step2[21] = SUB_EPI16(step2[21], s3_21_0);
- step2[22] = SUB_EPI16(step2[22], s3_22_0);
- step2[23] = SUB_EPI16(step2[23], s3_23_0);
- step2[24] = SUB_EPI16(step2[24], s3_24_0);
- step2[25] = SUB_EPI16(step2[25], s3_25_0);
- step2[26] = SUB_EPI16(step2[26], s3_26_0);
- step2[27] = SUB_EPI16(step2[27], s3_27_0);
- step1[28] = SUB_EPI16(step1[28], s3_28_0);
- step1[29] = SUB_EPI16(step1[29], s3_29_0);
- step1[30] = SUB_EPI16(step1[30], s3_30_0);
- step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x32(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
- &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
- &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
- &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- step2[0] = _mm_add_epi16(step2[0], kOne);
- step2[1] = _mm_add_epi16(step2[1], kOne);
- step2[2] = _mm_add_epi16(step2[2], kOne);
- step2[3] = _mm_add_epi16(step2[3], kOne);
- step2[4] = _mm_add_epi16(step2[4], kOne);
- step2[5] = _mm_add_epi16(step2[5], kOne);
- step2[6] = _mm_add_epi16(step2[6], kOne);
- step2[7] = _mm_add_epi16(step2[7], kOne);
- step2[8] = _mm_add_epi16(step2[8], kOne);
- step2[9] = _mm_add_epi16(step2[9], kOne);
- step2[10] = _mm_add_epi16(step2[10], kOne);
- step2[11] = _mm_add_epi16(step2[11], kOne);
- step2[12] = _mm_add_epi16(step2[12], kOne);
- step2[13] = _mm_add_epi16(step2[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step1[16] = _mm_add_epi16(step1[16], kOne);
- step1[17] = _mm_add_epi16(step1[17], kOne);
- step1[18] = _mm_add_epi16(step1[18], kOne);
- step1[19] = _mm_add_epi16(step1[19], kOne);
- step2[20] = _mm_add_epi16(step2[20], kOne);
- step2[21] = _mm_add_epi16(step2[21], kOne);
- step2[22] = _mm_add_epi16(step2[22], kOne);
- step2[23] = _mm_add_epi16(step2[23], kOne);
- step2[24] = _mm_add_epi16(step2[24], kOne);
- step2[25] = _mm_add_epi16(step2[25], kOne);
- step2[26] = _mm_add_epi16(step2[26], kOne);
- step2[27] = _mm_add_epi16(step2[27], kOne);
- step1[28] = _mm_add_epi16(step1[28], kOne);
- step1[29] = _mm_add_epi16(step1[29], kOne);
- step1[30] = _mm_add_epi16(step1[30], kOne);
- step1[31] = _mm_add_epi16(step1[31], kOne);
-
- step2[0] = _mm_srai_epi16(step2[0], 2);
- step2[1] = _mm_srai_epi16(step2[1], 2);
- step2[2] = _mm_srai_epi16(step2[2], 2);
- step2[3] = _mm_srai_epi16(step2[3], 2);
- step2[4] = _mm_srai_epi16(step2[4], 2);
- step2[5] = _mm_srai_epi16(step2[5], 2);
- step2[6] = _mm_srai_epi16(step2[6], 2);
- step2[7] = _mm_srai_epi16(step2[7], 2);
- step2[8] = _mm_srai_epi16(step2[8], 2);
- step2[9] = _mm_srai_epi16(step2[9], 2);
- step2[10] = _mm_srai_epi16(step2[10], 2);
- step2[11] = _mm_srai_epi16(step2[11], 2);
- step2[12] = _mm_srai_epi16(step2[12], 2);
- step2[13] = _mm_srai_epi16(step2[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step1[16] = _mm_srai_epi16(step1[16], 2);
- step1[17] = _mm_srai_epi16(step1[17], 2);
- step1[18] = _mm_srai_epi16(step1[18], 2);
- step1[19] = _mm_srai_epi16(step1[19], 2);
- step2[20] = _mm_srai_epi16(step2[20], 2);
- step2[21] = _mm_srai_epi16(step2[21], 2);
- step2[22] = _mm_srai_epi16(step2[22], 2);
- step2[23] = _mm_srai_epi16(step2[23], 2);
- step2[24] = _mm_srai_epi16(step2[24], 2);
- step2[25] = _mm_srai_epi16(step2[25], 2);
- step2[26] = _mm_srai_epi16(step2[26], 2);
- step2[27] = _mm_srai_epi16(step2[27], 2);
- step1[28] = _mm_srai_epi16(step1[28], 2);
- step1[29] = _mm_srai_epi16(step1[29], 2);
- step1[30] = _mm_srai_epi16(step1[30], 2);
- step1[31] = _mm_srai_epi16(step1[31], 2);
- }
-#endif // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
- if (pass == 0) {
-#endif
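- // In the high-precision configuration the 16-bit stages below are only
- // used for the column pass (pass 0); the row pass presumably takes a
- // wider-precision path further down.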
- // Stage 3
- {
- step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
- step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
- step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
- step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
- step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
- step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
- step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
- step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
- &step3[3], &step3[4], &step3[5],
- &step3[6], &step3[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
- &step3[13]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[16] = ADD_EPI16(step2[23], step1[16]);
- step3[17] = ADD_EPI16(step2[22], step1[17]);
- step3[18] = ADD_EPI16(step2[21], step1[18]);
- step3[19] = ADD_EPI16(step2[20], step1[19]);
- step3[20] = SUB_EPI16(step1[19], step2[20]);
- step3[21] = SUB_EPI16(step1[18], step2[21]);
- step3[22] = SUB_EPI16(step1[17], step2[22]);
- step3[23] = SUB_EPI16(step1[16], step2[23]);
- step3[24] = SUB_EPI16(step1[31], step2[24]);
- step3[25] = SUB_EPI16(step1[30], step2[25]);
- step3[26] = SUB_EPI16(step1[29], step2[26]);
- step3[27] = SUB_EPI16(step1[28], step2[27]);
- step3[28] = ADD_EPI16(step2[27], step1[28]);
- step3[29] = ADD_EPI16(step2[26], step1[29]);
- step3[30] = ADD_EPI16(step2[25], step1[30]);
- step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
- &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
- &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
- &step3[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
- // Stage 4
- {
- step1[0] = ADD_EPI16(step3[3], step3[0]);
- step1[1] = ADD_EPI16(step3[2], step3[1]);
- step1[2] = SUB_EPI16(step3[1], step3[2]);
- step1[3] = SUB_EPI16(step3[0], step3[3]);
- step1[8] = ADD_EPI16(step3[11], step2[8]);
- step1[9] = ADD_EPI16(step3[10], step2[9]);
- step1[10] = SUB_EPI16(step2[9], step3[10]);
- step1[11] = SUB_EPI16(step2[8], step3[11]);
- step1[12] = SUB_EPI16(step2[15], step3[12]);
- step1[13] = SUB_EPI16(step2[14], step3[13]);
- step1[14] = ADD_EPI16(step3[13], step2[14]);
- step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
- &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
- &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
- &step1[21], &step1[26], &step1[27],
- &step1[28], &step1[29]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 5
- {
- step2[4] = ADD_EPI16(step1[5], step3[4]);
- step2[5] = SUB_EPI16(step3[4], step1[5]);
- step2[6] = SUB_EPI16(step3[7], step1[6]);
- step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
- &step2[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
- const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
- const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
- const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
- const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i out_00_4 =
- _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m128i out_00_5 =
- _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m128i out_16_4 =
- _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m128i out_16_5 =
- _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m128i out_08_4 =
- _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m128i out_08_5 =
- _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m128i out_24_4 =
- _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m128i out_24_5 =
- _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[0] = _mm_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm_packs_epi32(out_16_6, out_16_7);
- out[8] = _mm_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
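- // out[0], out[16], out[8] and out[24] are final at this point; the
- // remaining stages produce the other outputs of this 8-column group.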
- {
- const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
- const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
- const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
- const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
- const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
- &step2[14]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step2[16] = ADD_EPI16(step1[19], step3[16]);
- step2[17] = ADD_EPI16(step1[18], step3[17]);
- step2[18] = SUB_EPI16(step3[17], step1[18]);
- step2[19] = SUB_EPI16(step3[16], step1[19]);
- step2[20] = SUB_EPI16(step3[23], step1[20]);
- step2[21] = SUB_EPI16(step3[22], step1[21]);
- step2[22] = ADD_EPI16(step1[21], step3[22]);
- step2[23] = ADD_EPI16(step1[20], step3[23]);
- step2[24] = ADD_EPI16(step1[27], step3[24]);
- step2[25] = ADD_EPI16(step1[26], step3[25]);
- step2[26] = SUB_EPI16(step3[25], step1[26]);
- step2[27] = SUB_EPI16(step3[24], step1[27]);
- step2[28] = SUB_EPI16(step3[31], step1[28]);
- step2[29] = SUB_EPI16(step3[30], step1[29]);
- step2[30] = ADD_EPI16(step1[29], step3[30]);
- step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
- &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
- &step2[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 6
- {
- const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m128i out_04_4 =
- _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m128i out_04_5 =
- _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m128i out_20_4 =
- _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m128i out_20_5 =
- _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m128i out_12_4 =
- _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m128i out_12_5 =
- _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m128i out_28_4 =
- _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m128i out_28_5 =
- _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[4] = _mm_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[8] = ADD_EPI16(step2[9], step1[8]);
- step3[9] = SUB_EPI16(step1[8], step2[9]);
- step3[10] = SUB_EPI16(step1[11], step2[10]);
- step3[11] = ADD_EPI16(step2[10], step1[11]);
- step3[12] = ADD_EPI16(step2[13], step1[12]);
- step3[13] = SUB_EPI16(step1[12], step2[13]);
- step3[14] = SUB_EPI16(step1[15], step2[14]);
- step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
- &step3[11], &step3[12], &step3[13],
- &step3[14], &step3[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
- const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
- const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
- const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
- const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
- const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
- const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
- const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
- const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
- &step3[22], &step3[25], &step3[26],
- &step3[29], &step3[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
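- // Stage 7 below forms out[2], [18], [10], [26], [6], [22], [14] and [30]
- // from step3[8..15] using the corresponding odd cospi pairs.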
- // Stage 7
- {
- const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
- const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
- const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
- const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
- const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
- const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
- const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
- const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
- const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m128i out_02_4 =
- _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m128i out_02_5 =
- _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m128i out_18_4 =
- _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m128i out_18_5 =
- _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m128i out_10_4 =
- _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m128i out_10_5 =
- _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m128i out_26_4 =
- _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m128i out_26_5 =
- _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m128i out_06_4 =
- _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m128i out_06_5 =
- _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m128i out_22_4 =
- _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m128i out_22_5 =
- _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m128i out_14_4 =
- _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m128i out_14_5 =
- _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m128i out_30_4 =
- _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m128i out_30_5 =
- _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[2] = _mm_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm_packs_epi32(out_26_6, out_26_7);
- out[6] = _mm_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step1[16] = ADD_EPI16(step3[17], step2[16]);
- step1[17] = SUB_EPI16(step2[16], step3[17]);
- step1[18] = SUB_EPI16(step2[19], step3[18]);
- step1[19] = ADD_EPI16(step3[18], step2[19]);
- step1[20] = ADD_EPI16(step3[21], step2[20]);
- step1[21] = SUB_EPI16(step2[20], step3[21]);
- step1[22] = SUB_EPI16(step2[23], step3[22]);
- step1[23] = ADD_EPI16(step3[22], step2[23]);
- step1[24] = ADD_EPI16(step3[25], step2[24]);
- step1[25] = SUB_EPI16(step2[24], step3[25]);
- step1[26] = SUB_EPI16(step2[27], step3[26]);
- step1[27] = ADD_EPI16(step3[26], step2[27]);
- step1[28] = ADD_EPI16(step3[29], step2[28]);
- step1[29] = SUB_EPI16(step2[28], step3[29]);
- step1[30] = SUB_EPI16(step2[31], step3[30]);
- step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
- &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
- &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
- &step1[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Final stage --- output indices are bit-reversed.
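- // Each mirrored step1 pair (a, b) below yields out[k] = cospi_(32-k) * a
- // + cospi_k * b and out[32 - k] = cospi_(32-k) * b - cospi_k * a for the
- // remaining odd-indexed outputs.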
- {
- const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
- const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
- const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
- const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
- const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
- const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
- const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
- const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
- const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m128i out_01_4 =
- _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m128i out_01_5 =
- _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m128i out_17_4 =
- _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m128i out_17_5 =
- _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m128i out_09_4 =
- _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m128i out_09_5 =
- _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m128i out_25_4 =
- _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m128i out_25_5 =
- _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m128i out_07_4 =
- _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m128i out_07_5 =
- _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m128i out_23_4 =
- _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m128i out_23_5 =
- _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m128i out_15_4 =
- _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m128i out_15_5 =
- _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m128i out_31_4 =
- _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m128i out_31_5 =
- _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[1] = _mm_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm_packs_epi32(out_17_6, out_17_7);
- out[9] = _mm_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm_packs_epi32(out_25_6, out_25_7);
- out[7] = _mm_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
- const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
- const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
- const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
- const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
- const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
- const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
- const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
- const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m128i out_05_4 =
- _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m128i out_05_5 =
- _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m128i out_21_4 =
- _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m128i out_21_5 =
- _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m128i out_13_4 =
- _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m128i out_13_5 =
- _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m128i out_29_4 =
- _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m128i out_29_5 =
- _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m128i out_03_4 =
- _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m128i out_03_5 =
- _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m128i out_19_4 =
- _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m128i out_19_5 =
- _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m128i out_11_4 =
- _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m128i out_11_5 =
- _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m128i out_27_4 =
- _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m128i out_27_5 =
- _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[5] = _mm_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm_packs_epi32(out_29_6, out_29_7);
- out[3] = _mm_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-#if FDCT32x32_HIGH_PRECISION
- } else {
- __m128i lstep1[64], lstep2[64], lstep3[64];
- __m128i u[32], v[32], sign[16];
- const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
- // start using 32-bit operations
- // stage 3
- {
- // expanding to 32 bits prior to the addition operations
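- // Note: unpacking against kZero and then _mm_madd_epi16() with kOne
- // sign-extends each 16-bit lane to 32 bits (madd treats its operands as
- // signed), giving the additions below full 32-bit headroom.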
- lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
- lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
- lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
- lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
- lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
- lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
- lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
- lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
- lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
- lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
- lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
- lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
- lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
- lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
- lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
- lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
- lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
- lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
- lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
- lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
- lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
- lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
- lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
- lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
- lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
- lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
- lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
- lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
- lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
- lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
- lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
- lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
- lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
- lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
- lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
- lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
- lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
- lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
- lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
- lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
- lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
- lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
- lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
- lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
- lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
- lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
- lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
- lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- }
- {
- lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
- lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
- lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
- lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
- lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
- lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
- lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
- lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
- lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
- lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
- lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
- lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
- lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
- lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
- lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
- lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
- lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
- lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
- lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
- lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
- lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
- lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
- lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
- lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
- lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
- lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
- lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
- lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
- lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
- lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
- lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
- lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
- lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
- lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
- lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
- lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
- lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
- lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
- lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
- lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
- lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
- lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
- lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
- lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
- lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
- lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
- lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
- lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
- lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
- lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
- lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
- lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
- lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
- lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
- lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
- lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
- lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
- lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
- lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
- lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
- lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
- lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
- lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
- lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
- lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
- lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
- lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
- lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
- lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
- lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
- lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
- lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
- lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
- lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
- lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
- lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
- lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
- lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
- lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
- lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
- lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
- lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
- lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
- lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
- lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
- lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
- lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
- lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
- lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
- lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
- lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
- lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
- lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
- lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
- lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
- lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
- }
-
- // stage 4
- {
- // expanding to 32 bits prior to the addition operations
- lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
- lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
- lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
- lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
- lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
- lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
- lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
- lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
- lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
- lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
- lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
- lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
- lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
- lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
- lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
- lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
- lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
- lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
- lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
- lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
- lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
- lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
- lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
- lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
- lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
- lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
- lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
- lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
- lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
- lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
- lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
- lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
- lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
- lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
- lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
- lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
- lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
- lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
- lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
- lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
- }
- {
- // stage 4, continued: cospi_16_64 butterfly on lstep3[10..13], producing
- // lstep1[10..13].
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
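- // k_madd_epi32() effectively takes the dot product of each interleaved
- // 32-bit pair with the packed cosine constants (one 64-bit result per
- // pair); k_packs_epi64() then narrows a pair of those results back to
- // four 32-bit lanes ahead of the rounding shift below.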
- v[0] = k_madd_epi32(u[0], k32_p16_m16);
- v[1] = k_madd_epi32(u[1], k32_p16_m16);
- v[2] = k_madd_epi32(u[2], k32_p16_m16);
- v[3] = k_madd_epi32(u[3], k32_p16_m16);
- v[4] = k_madd_epi32(u[0], k32_p16_p16);
- v[5] = k_madd_epi32(u[1], k32_p16_p16);
- v[6] = k_madd_epi32(u[2], k32_p16_p16);
- v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
- &v[5], &v[6], &v[7], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- }
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
- u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
- u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
- u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
- u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
- u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
- u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
- u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
- u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
- u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
- u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
- u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
- u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
- u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
- u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
- u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m08_p24);
- v[5] = k_madd_epi32(u[5], k32_m08_p24);
- v[6] = k_madd_epi32(u[6], k32_m08_p24);
- v[7] = k_madd_epi32(u[7], k32_m08_p24);
- v[8] = k_madd_epi32(u[8], k32_m24_m08);
- v[9] = k_madd_epi32(u[9], k32_m24_m08);
- v[10] = k_madd_epi32(u[10], k32_m24_m08);
- v[11] = k_madd_epi32(u[11], k32_m24_m08);
- v[12] = k_madd_epi32(u[12], k32_m24_m08);
- v[13] = k_madd_epi32(u[13], k32_m24_m08);
- v[14] = k_madd_epi32(u[14], k32_m24_m08);
- v[15] = k_madd_epi32(u[15], k32_m24_m08);
- v[16] = k_madd_epi32(u[12], k32_m08_p24);
- v[17] = k_madd_epi32(u[13], k32_m08_p24);
- v[18] = k_madd_epi32(u[14], k32_m08_p24);
- v[19] = k_madd_epi32(u[15], k32_m08_p24);
- v[20] = k_madd_epi32(u[8], k32_m08_p24);
- v[21] = k_madd_epi32(u[9], k32_m08_p24);
- v[22] = k_madd_epi32(u[10], k32_m08_p24);
- v[23] = k_madd_epi32(u[11], k32_m08_p24);
- v[24] = k_madd_epi32(u[4], k32_p24_p08);
- v[25] = k_madd_epi32(u[5], k32_p24_p08);
- v[26] = k_madd_epi32(u[6], k32_p24_p08);
- v[27] = k_madd_epi32(u[7], k32_p24_p08);
- v[28] = k_madd_epi32(u[0], k32_p24_p08);
- v[29] = k_madd_epi32(u[1], k32_p24_p08);
- v[30] = k_madd_epi32(u[2], k32_p24_p08);
- v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 5
- {
- lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
- lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
- lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
- lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
- lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
- lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
- lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
- lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
- }
- {
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
- u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
- u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
- u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
- u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
- u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
- u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
- u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_p16);
- v[1] = k_madd_epi32(u[1], k32_p16_p16);
- v[2] = k_madd_epi32(u[2], k32_p16_p16);
- v[3] = k_madd_epi32(u[3], k32_p16_p16);
- v[4] = k_madd_epi32(u[0], k32_p16_m16);
- v[5] = k_madd_epi32(u[1], k32_p16_m16);
- v[6] = k_madd_epi32(u[2], k32_p16_m16);
- v[7] = k_madd_epi32(u[3], k32_p16_m16);
- v[8] = k_madd_epi32(u[4], k32_p24_p08);
- v[9] = k_madd_epi32(u[5], k32_p24_p08);
- v[10] = k_madd_epi32(u[6], k32_p24_p08);
- v[11] = k_madd_epi32(u[7], k32_p24_p08);
- v[12] = k_madd_epi32(u[4], k32_m08_p24);
- v[13] = k_madd_epi32(u[5], k32_m08_p24);
- v[14] = k_madd_epi32(u[6], k32_m08_p24);
- v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
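- // Round with (x + 1 + (x < 0)) >> 2: the sign mask supplies the extra 1
- // for negative values before the final shift and 16-bit pack.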
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- // Combine
- out[0] = _mm_packs_epi32(u[0], u[1]);
- out[16] = _mm_packs_epi32(u[2], u[3]);
- out[8] = _mm_packs_epi32(u[4], u[5]);
- out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
- u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
- u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
- u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
- u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
- u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
- u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
- u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m24_m08);
- v[5] = k_madd_epi32(u[5], k32_m24_m08);
- v[6] = k_madd_epi32(u[6], k32_m24_m08);
- v[7] = k_madd_epi32(u[7], k32_m24_m08);
- v[8] = k_madd_epi32(u[4], k32_m08_p24);
- v[9] = k_madd_epi32(u[5], k32_m08_p24);
- v[10] = k_madd_epi32(u[6], k32_m08_p24);
- v[11] = k_madd_epi32(u[7], k32_m08_p24);
- v[12] = k_madd_epi32(u[0], k32_p24_p08);
- v[13] = k_madd_epi32(u[1], k32_p24_p08);
- v[14] = k_madd_epi32(u[2], k32_p24_p08);
- v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- }
- {
- lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
- lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
- lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
- lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
- lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
- lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
- lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
- lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
- lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
- lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
- lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
- lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
- lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
- lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
- lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
- lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
- lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
- lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
- lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
- lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
- lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
- lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
- lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
- lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
- lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
- lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
- lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
- lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
- lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
- lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
- lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
- lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
- }
- // stage 6
- {
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
- u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
- v[0] = k_madd_epi32(u[0], k32_p28_p04);
- v[1] = k_madd_epi32(u[1], k32_p28_p04);
- v[2] = k_madd_epi32(u[2], k32_p28_p04);
- v[3] = k_madd_epi32(u[3], k32_p28_p04);
- v[4] = k_madd_epi32(u[4], k32_p12_p20);
- v[5] = k_madd_epi32(u[5], k32_p12_p20);
- v[6] = k_madd_epi32(u[6], k32_p12_p20);
- v[7] = k_madd_epi32(u[7], k32_p12_p20);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m04_p28);
- v[13] = k_madd_epi32(u[13], k32_m04_p28);
- v[14] = k_madd_epi32(u[14], k32_m04_p28);
- v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- out[4] = _mm_packs_epi32(u[0], u[1]);
- out[20] = _mm_packs_epi32(u[2], u[3]);
- out[12] = _mm_packs_epi32(u[4], u[5]);
- out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
- lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
- lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
- lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
- lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
- lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
- lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
- lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
- lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
- lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
- lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
- lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
- lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
- lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
- lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
- lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
- }
- {
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
- const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m12_m20 =
- pair_set_epi32(-cospi_12_64, -cospi_20_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
- u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
- u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
- u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
- u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
- u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
- u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
- u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
- u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
- u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
- u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
- u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
- u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
- u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
- u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
- u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
- v[0] = k_madd_epi32(u[0], k32_m04_p28);
- v[1] = k_madd_epi32(u[1], k32_m04_p28);
- v[2] = k_madd_epi32(u[2], k32_m04_p28);
- v[3] = k_madd_epi32(u[3], k32_m04_p28);
- v[4] = k_madd_epi32(u[4], k32_m28_m04);
- v[5] = k_madd_epi32(u[5], k32_m28_m04);
- v[6] = k_madd_epi32(u[6], k32_m28_m04);
- v[7] = k_madd_epi32(u[7], k32_m28_m04);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m12_m20);
- v[13] = k_madd_epi32(u[13], k32_m12_m20);
- v[14] = k_madd_epi32(u[14], k32_m12_m20);
- v[15] = k_madd_epi32(u[15], k32_m12_m20);
- v[16] = k_madd_epi32(u[12], k32_m20_p12);
- v[17] = k_madd_epi32(u[13], k32_m20_p12);
- v[18] = k_madd_epi32(u[14], k32_m20_p12);
- v[19] = k_madd_epi32(u[15], k32_m20_p12);
- v[20] = k_madd_epi32(u[8], k32_p12_p20);
- v[21] = k_madd_epi32(u[9], k32_p12_p20);
- v[22] = k_madd_epi32(u[10], k32_p12_p20);
- v[23] = k_madd_epi32(u[11], k32_p12_p20);
- v[24] = k_madd_epi32(u[4], k32_m04_p28);
- v[25] = k_madd_epi32(u[5], k32_m04_p28);
- v[26] = k_madd_epi32(u[6], k32_m04_p28);
- v[27] = k_madd_epi32(u[7], k32_m04_p28);
- v[28] = k_madd_epi32(u[0], k32_p28_p04);
- v[29] = k_madd_epi32(u[1], k32_p28_p04);
- v[30] = k_madd_epi32(u[2], k32_p28_p04);
- v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 7
- {
- const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
- const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
- const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
- const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
- const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
- const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
- const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
- const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
- u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
- u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
- u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
- u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
- u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
- u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
- u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
- u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
- u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
- u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
- u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
- u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
- u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
- u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
- u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
- v[0] = k_madd_epi32(u[0], k32_p30_p02);
- v[1] = k_madd_epi32(u[1], k32_p30_p02);
- v[2] = k_madd_epi32(u[2], k32_p30_p02);
- v[3] = k_madd_epi32(u[3], k32_p30_p02);
- v[4] = k_madd_epi32(u[4], k32_p14_p18);
- v[5] = k_madd_epi32(u[5], k32_p14_p18);
- v[6] = k_madd_epi32(u[6], k32_p14_p18);
- v[7] = k_madd_epi32(u[7], k32_p14_p18);
- v[8] = k_madd_epi32(u[8], k32_p22_p10);
- v[9] = k_madd_epi32(u[9], k32_p22_p10);
- v[10] = k_madd_epi32(u[10], k32_p22_p10);
- v[11] = k_madd_epi32(u[11], k32_p22_p10);
- v[12] = k_madd_epi32(u[12], k32_p06_p26);
- v[13] = k_madd_epi32(u[13], k32_p06_p26);
- v[14] = k_madd_epi32(u[14], k32_p06_p26);
- v[15] = k_madd_epi32(u[15], k32_p06_p26);
- v[16] = k_madd_epi32(u[12], k32_m26_p06);
- v[17] = k_madd_epi32(u[13], k32_m26_p06);
- v[18] = k_madd_epi32(u[14], k32_m26_p06);
- v[19] = k_madd_epi32(u[15], k32_m26_p06);
- v[20] = k_madd_epi32(u[8], k32_m10_p22);
- v[21] = k_madd_epi32(u[9], k32_m10_p22);
- v[22] = k_madd_epi32(u[10], k32_m10_p22);
- v[23] = k_madd_epi32(u[11], k32_m10_p22);
- v[24] = k_madd_epi32(u[4], k32_m18_p14);
- v[25] = k_madd_epi32(u[5], k32_m18_p14);
- v[26] = k_madd_epi32(u[6], k32_m18_p14);
- v[27] = k_madd_epi32(u[7], k32_m18_p14);
- v[28] = k_madd_epi32(u[0], k32_m02_p30);
- v[29] = k_madd_epi32(u[1], k32_m02_p30);
- v[30] = k_madd_epi32(u[2], k32_m02_p30);
- v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[2] = _mm_packs_epi32(u[0], u[1]);
- out[18] = _mm_packs_epi32(u[2], u[3]);
- out[10] = _mm_packs_epi32(u[4], u[5]);
- out[26] = _mm_packs_epi32(u[6], u[7]);
- out[6] = _mm_packs_epi32(u[8], u[9]);
- out[22] = _mm_packs_epi32(u[10], u[11]);
- out[14] = _mm_packs_epi32(u[12], u[13]);
- out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
- lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
- lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
- lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
- lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
- lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
- lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
- lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
- lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
- lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
- lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
- lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
- lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
- lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
- lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
- lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
- lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
- lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
- lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
- lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
- lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
- lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
- lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
- lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
- lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
- lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
- lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
- lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
- lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
- lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
- lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
- lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
- }
- // stage 8
- {
- const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
- const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
- const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
- const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
- const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
- const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
- const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
- const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
- u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
- u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
- u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
- u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
- u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
- u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
- u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
- u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
- u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
- u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
- u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
- u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
- u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
- u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
- u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
- v[0] = k_madd_epi32(u[0], k32_p31_p01);
- v[1] = k_madd_epi32(u[1], k32_p31_p01);
- v[2] = k_madd_epi32(u[2], k32_p31_p01);
- v[3] = k_madd_epi32(u[3], k32_p31_p01);
- v[4] = k_madd_epi32(u[4], k32_p15_p17);
- v[5] = k_madd_epi32(u[5], k32_p15_p17);
- v[6] = k_madd_epi32(u[6], k32_p15_p17);
- v[7] = k_madd_epi32(u[7], k32_p15_p17);
- v[8] = k_madd_epi32(u[8], k32_p23_p09);
- v[9] = k_madd_epi32(u[9], k32_p23_p09);
- v[10] = k_madd_epi32(u[10], k32_p23_p09);
- v[11] = k_madd_epi32(u[11], k32_p23_p09);
- v[12] = k_madd_epi32(u[12], k32_p07_p25);
- v[13] = k_madd_epi32(u[13], k32_p07_p25);
- v[14] = k_madd_epi32(u[14], k32_p07_p25);
- v[15] = k_madd_epi32(u[15], k32_p07_p25);
- v[16] = k_madd_epi32(u[12], k32_m25_p07);
- v[17] = k_madd_epi32(u[13], k32_m25_p07);
- v[18] = k_madd_epi32(u[14], k32_m25_p07);
- v[19] = k_madd_epi32(u[15], k32_m25_p07);
- v[20] = k_madd_epi32(u[8], k32_m09_p23);
- v[21] = k_madd_epi32(u[9], k32_m09_p23);
- v[22] = k_madd_epi32(u[10], k32_m09_p23);
- v[23] = k_madd_epi32(u[11], k32_m09_p23);
- v[24] = k_madd_epi32(u[4], k32_m17_p15);
- v[25] = k_madd_epi32(u[5], k32_m17_p15);
- v[26] = k_madd_epi32(u[6], k32_m17_p15);
- v[27] = k_madd_epi32(u[7], k32_m17_p15);
- v[28] = k_madd_epi32(u[0], k32_m01_p31);
- v[29] = k_madd_epi32(u[1], k32_m01_p31);
- v[30] = k_madd_epi32(u[2], k32_m01_p31);
- v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[1] = _mm_packs_epi32(u[0], u[1]);
- out[17] = _mm_packs_epi32(u[2], u[3]);
- out[9] = _mm_packs_epi32(u[4], u[5]);
- out[25] = _mm_packs_epi32(u[6], u[7]);
- out[7] = _mm_packs_epi32(u[8], u[9]);
- out[23] = _mm_packs_epi32(u[10], u[11]);
- out[15] = _mm_packs_epi32(u[12], u[13]);
- out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
- const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
- const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
- const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
- const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
- const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
- const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
- const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
- u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
- u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
- u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
- u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
- u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
- u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
- u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
- u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
- u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
- u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
- u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
- u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
- u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
- u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
- u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
- v[0] = k_madd_epi32(u[0], k32_p27_p05);
- v[1] = k_madd_epi32(u[1], k32_p27_p05);
- v[2] = k_madd_epi32(u[2], k32_p27_p05);
- v[3] = k_madd_epi32(u[3], k32_p27_p05);
- v[4] = k_madd_epi32(u[4], k32_p11_p21);
- v[5] = k_madd_epi32(u[5], k32_p11_p21);
- v[6] = k_madd_epi32(u[6], k32_p11_p21);
- v[7] = k_madd_epi32(u[7], k32_p11_p21);
- v[8] = k_madd_epi32(u[8], k32_p19_p13);
- v[9] = k_madd_epi32(u[9], k32_p19_p13);
- v[10] = k_madd_epi32(u[10], k32_p19_p13);
- v[11] = k_madd_epi32(u[11], k32_p19_p13);
- v[12] = k_madd_epi32(u[12], k32_p03_p29);
- v[13] = k_madd_epi32(u[13], k32_p03_p29);
- v[14] = k_madd_epi32(u[14], k32_p03_p29);
- v[15] = k_madd_epi32(u[15], k32_p03_p29);
- v[16] = k_madd_epi32(u[12], k32_m29_p03);
- v[17] = k_madd_epi32(u[13], k32_m29_p03);
- v[18] = k_madd_epi32(u[14], k32_m29_p03);
- v[19] = k_madd_epi32(u[15], k32_m29_p03);
- v[20] = k_madd_epi32(u[8], k32_m13_p19);
- v[21] = k_madd_epi32(u[9], k32_m13_p19);
- v[22] = k_madd_epi32(u[10], k32_m13_p19);
- v[23] = k_madd_epi32(u[11], k32_m13_p19);
- v[24] = k_madd_epi32(u[4], k32_m21_p11);
- v[25] = k_madd_epi32(u[5], k32_m21_p11);
- v[26] = k_madd_epi32(u[6], k32_m21_p11);
- v[27] = k_madd_epi32(u[7], k32_m21_p11);
- v[28] = k_madd_epi32(u[0], k32_m05_p27);
- v[29] = k_madd_epi32(u[1], k32_m05_p27);
- v[30] = k_madd_epi32(u[2], k32_m05_p27);
- v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[5] = _mm_packs_epi32(u[0], u[1]);
- out[21] = _mm_packs_epi32(u[2], u[3]);
- out[13] = _mm_packs_epi32(u[4], u[5]);
- out[29] = _mm_packs_epi32(u[6], u[7]);
- out[3] = _mm_packs_epi32(u[8], u[9]);
- out[19] = _mm_packs_epi32(u[10], u[11]);
- out[11] = _mm_packs_epi32(u[12], u[13]);
- out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
-#endif // FDCT32x32_HIGH_PRECISION
- // Transpose the results, do it as four 8x8 transposes.
- {
- int transpose_block;
- int16_t *output0 = &intermediate[column_start * 32];
- tran_low_t *output1 = &output_org[column_start * 32];
- for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
- __m128i *this_out = &out[8 * transpose_block];
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- if (0 == pass) {
- // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
- // TODO(cd): see quality impact of only doing
- // output[j] = (output[j] + 1) >> 2;
- // which would remove the code between here ...
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
- __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
- __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
- __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
- __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
- __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
- __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
- __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
- tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
- tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
- tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
- tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
- tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
- tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
- tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
- tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
- // ... and here.
- // PS: also change code in av1/encoder/dct.c
- tr2_0 = _mm_add_epi16(tr2_0, kOne);
- tr2_1 = _mm_add_epi16(tr2_1, kOne);
- tr2_2 = _mm_add_epi16(tr2_2, kOne);
- tr2_3 = _mm_add_epi16(tr2_3, kOne);
- tr2_4 = _mm_add_epi16(tr2_4, kOne);
- tr2_5 = _mm_add_epi16(tr2_5, kOne);
- tr2_6 = _mm_add_epi16(tr2_6, kOne);
- tr2_7 = _mm_add_epi16(tr2_7, kOne);
- tr2_0 = _mm_srai_epi16(tr2_0, 2);
- tr2_1 = _mm_srai_epi16(tr2_1, 2);
- tr2_2 = _mm_srai_epi16(tr2_2, 2);
- tr2_3 = _mm_srai_epi16(tr2_3, 2);
- tr2_4 = _mm_srai_epi16(tr2_4, 2);
- tr2_5 = _mm_srai_epi16(tr2_5, 2);
- tr2_6 = _mm_srai_epi16(tr2_6, 2);
- tr2_7 = _mm_srai_epi16(tr2_7, 2);
- }
- // Note: even though all these stores are aligned, using the aligned
- // intrinsic makes the code slightly slower.
- if (pass == 0) {
- _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
- // Process next 8x8
- output0 += 8;
- } else {
- storeu_output(&tr2_0, (output1 + 0 * 32));
- storeu_output(&tr2_1, (output1 + 1 * 32));
- storeu_output(&tr2_2, (output1 + 2 * 32));
- storeu_output(&tr2_3, (output1 + 3 * 32));
- storeu_output(&tr2_4, (output1 + 4 * 32));
- storeu_output(&tr2_5, (output1 + 5 * 32));
- storeu_output(&tr2_6, (output1 + 6 * 32));
- storeu_output(&tr2_7, (output1 + 7 * 32));
- // Process next 8x8
- output1 += 8;
- }
- }
- }
- }
- }
-} // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
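For reference, the rounding that the deleted fdct32x32 SSE2 passes implement with compare/subtract/add/shift sequences can be written in scalar form as below. This is only an illustrative sketch; the helper names do not exist in the tree, and the two variants correspond to the cmplt-based 32-bit row path and the cmpgt-based 16-bit column path (the latter matching the in-code comment output[j] = (output[j] + 1 + (output[j] > 0)) >> 2).

/* Scalar sketch of the per-coefficient rounding in the removed code.
 * The 32-bit row path subtracts a _mm_cmplt_epi32 mask (i.e. adds 1 when
 * the value is negative), then adds 1 and arithmetic-shifts right by 2. */
static int32_t round_rows_sketch(int32_t x) {
  return (x + 1 + (x < 0)) >> 2;
}
/* The 16-bit column path (pass 0) uses the mirrored bias via
 * _mm_cmpgt_epi16, matching the "(output[j] + 1 + (output[j] > 0)) >> 2"
 * comment in the deleted transpose loop. */
static int16_t round_cols_sketch(int16_t x) {
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}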
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
index f0bcef9..c09a019 100644
--- a/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -1,354 +1,5 @@
#include "av1/common/x86/av1_txfm1d_sse4.h"
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 4;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[4];
- __m128i buf1[4];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
- buf0[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[2];
- buf1[2] = buf0[1];
- buf1[3] = buf0[3];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- }
-}
-
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 8;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[8];
- __m128i buf1[8];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
- buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
- buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
- buf0[4] = buf1[4];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
- buf0[7] = buf1[7];
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
- buf1[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
- buf1[3], bit);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
- buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- bit);
- btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
- buf0[6], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[4];
- buf1[2] = buf0[2];
- buf1[3] = buf0[6];
- buf1[4] = buf0[1];
- buf1[5] = buf0[5];
- buf1[6] = buf0[3];
- buf1[7] = buf0[7];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- }
-}
-
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 16;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[16];
- __m128i buf1[16];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
- buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
- buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
- buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
- buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
- buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
- buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
- buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
- buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
- buf0[14] = buf1[14];
- buf0[15] = buf1[15];
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
- buf1[4] = buf0[4];
- btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
- buf1[6], bit);
- buf1[7] = buf0[7];
- buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
- buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
- buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
- buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
- buf0[1], bit);
- btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
- buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
- buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
- buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
- buf0[8] = buf1[8];
- btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- buf0[11] = buf1[11];
- buf0[12] = buf1[12];
- buf0[15] = buf1[15];
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[1];
- buf1[2] = buf0[2];
- buf1[3] = buf0[3];
- btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
- bit);
- btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
- buf1[6], bit);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
- buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
- buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
- buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
- buf0[15], bit);
- btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = buf0[8];
- buf1[2] = buf0[4];
- buf1[3] = buf0[12];
- buf1[4] = buf0[2];
- buf1[5] = buf0[10];
- buf1[6] = buf0[6];
- buf1[7] = buf0[14];
- buf1[8] = buf0[1];
- buf1[9] = buf0[9];
- buf1[10] = buf0[5];
- buf1[11] = buf0[13];
- buf1[12] = buf0[3];
- buf1[13] = buf0[11];
- buf1[14] = buf0[7];
- buf1[15] = buf0[15];
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- }
-}
-
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t *cos_bit, const int8_t *stage_range) {
const int txfm_size = 32;
@@ -835,370 +486,6 @@
}
}
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 8;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[8];
- __m128i buf1[8];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[7];
- buf1[1] = buf0[0];
- buf1[2] = buf0[5];
- buf1[3] = buf0[2];
- buf1[4] = buf0[3];
- buf1[5] = buf0[4];
- buf1[6] = buf0[1];
- buf1[7] = buf0[6];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
- buf1[2] = buf0[6];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[4] = buf0[3];
- buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
- buf1[6] = buf0[5];
- buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- }
-}
-
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
- const int8_t *cos_bit, const int8_t *stage_range) {
- const int txfm_size = 16;
- const int num_per_128 = 4;
- const int32_t *cospi;
- __m128i buf0[16];
- __m128i buf1[16];
- int col_num = txfm_size / num_per_128;
- int bit;
- int col;
- (void)stage_range;
- for (col = 0; col < col_num; col++) {
- // stage 0;
- int32_t stage_idx = 0;
- buf0[0] = input[0 * col_num + col];
- buf0[1] = input[1 * col_num + col];
- buf0[2] = input[2 * col_num + col];
- buf0[3] = input[3 * col_num + col];
- buf0[4] = input[4 * col_num + col];
- buf0[5] = input[5 * col_num + col];
- buf0[6] = input[6 * col_num + col];
- buf0[7] = input[7 * col_num + col];
- buf0[8] = input[8 * col_num + col];
- buf0[9] = input[9 * col_num + col];
- buf0[10] = input[10 * col_num + col];
- buf0[11] = input[11 * col_num + col];
- buf0[12] = input[12 * col_num + col];
- buf0[13] = input[13 * col_num + col];
- buf0[14] = input[14 * col_num + col];
- buf0[15] = input[15 * col_num + col];
-
- // stage 1
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[15];
- buf1[1] = buf0[0];
- buf1[2] = buf0[13];
- buf1[3] = buf0[2];
- buf1[4] = buf0[11];
- buf1[5] = buf0[4];
- buf1[6] = buf0[9];
- buf1[7] = buf0[6];
- buf1[8] = buf0[7];
- buf1[9] = buf0[8];
- buf1[10] = buf0[5];
- buf1[11] = buf0[10];
- buf1[12] = buf0[3];
- buf1[13] = buf0[12];
- buf1[14] = buf0[1];
- buf1[15] = buf0[14];
-
- // stage 2
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
- buf0[9], bit);
- btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 3
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
- buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
- buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
- buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
- buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
-
- // stage 4
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- buf0[6] = buf1[6];
- buf0[7] = buf1[7];
- btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 5
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
- buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
- buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
- buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
- buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
- buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
- buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
- buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
- buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
-
- // stage 6
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- buf0[2] = buf1[2];
- buf0[3] = buf1[3];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
- buf0[5], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- buf0[10] = buf1[10];
- buf0[11] = buf1[11];
- btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 7
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
- buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
- buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
- buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
- buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
- buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
- buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
- buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
- buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
- buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
- buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
- buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
- buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
- buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
- buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
- buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
-
- // stage 8
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf0[0] = buf1[0];
- buf0[1] = buf1[1];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
- buf0[3], bit);
- buf0[4] = buf1[4];
- buf0[5] = buf1[5];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
- buf0[7], bit);
- buf0[8] = buf1[8];
- buf0[9] = buf1[9];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- buf0[12] = buf1[12];
- buf0[13] = buf1[13];
- btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
-
- // stage 9
- stage_idx++;
- bit = cos_bit[stage_idx];
- cospi = cospi_arr[bit - cos_bit_min];
- buf1[0] = buf0[0];
- buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
- buf1[2] = buf0[12];
- buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
- buf1[4] = buf0[6];
- buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
- buf1[6] = buf0[10];
- buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
- buf1[8] = buf0[3];
- buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
- buf1[10] = buf0[15];
- buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
- buf1[12] = buf0[5];
- buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
- buf1[14] = buf0[9];
- buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
- output[0 * col_num + col] = buf1[0];
- output[1 * col_num + col] = buf1[1];
- output[2 * col_num + col] = buf1[2];
- output[3 * col_num + col] = buf1[3];
- output[4 * col_num + col] = buf1[4];
- output[5 * col_num + col] = buf1[5];
- output[6 * col_num + col] = buf1[6];
- output[7 * col_num + col] = buf1[7];
- output[8 * col_num + col] = buf1[8];
- output[9 * col_num + col] = buf1[9];
- output[10 * col_num + col] = buf1[10];
- output[11 * col_num + col] = buf1[11];
- output[12 * col_num + col] = buf1[12];
- output[13 * col_num + col] = buf1[13];
- output[14 * col_num + col] = buf1[14];
- output[15 * col_num + col] = buf1[15];
- }
-}
-
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
const int8_t *cos_bit, const int8_t *stage_range) {
const int txfm_size = 32;
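The av1_fdct{4,8,16} and av1_fadst{8,16} kernels removed above all follow the same per-stage pattern: either paired _mm_add_epi32/_mm_sub_epi32 butterflies, or a rotation by a cospi pair done through btf_32_sse4_1_type0/type1 at that stage's cos_bit. A scalar sketch of such a rotation follows; the helper names and the exact sign convention are assumptions for illustration, not the macros' definitions from the tree.

/* Illustrative scalar form of one rotation stage (names hypothetical).
 * Each output is a weighted sum of two inputs, rounded and shifted by
 * the per-stage cos_bit; the intermediate products need 64 bits. */
static int32_t round_shift_sketch(int64_t x, int bit) {
  return (int32_t)((x + ((int64_t)1 << (bit - 1))) >> bit);
}
static void rotation_sketch(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                            int32_t *out0, int32_t *out1, int bit) {
  *out0 = round_shift_sketch((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
  *out1 = round_shift_sketch((int64_t)w0 * in1 - (int64_t)w1 * in0, bit);
}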
diff --git a/av1/common/x86/av1_fwd_txfm2d_sse4.c b/av1/common/x86/av1_fwd_txfm2d_sse4.c
index 07c283e..3d60b36 100644
--- a/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -28,13 +28,7 @@
static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
- case TXFM_TYPE_DCT4: return av1_fdct4_new_sse4_1; break;
- case TXFM_TYPE_DCT8: return av1_fdct8_new_sse4_1; break;
- case TXFM_TYPE_DCT16: return av1_fdct16_new_sse4_1; break;
case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
- case TXFM_TYPE_ADST4: return av1_fadst4_new_sse4_1; break;
- case TXFM_TYPE_ADST8: return av1_fadst8_new_sse4_1; break;
- case TXFM_TYPE_ADST16: return av1_fadst16_new_sse4_1; break;
case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
default: assert(0);
}
diff --git a/av1/common/x86/av1_fwd_txfm_impl_sse2.h b/av1/common/x86/av1_fwd_txfm_impl_sse2.h
deleted file mode 100644
index 0e341ac..0000000
--- a/av1/common/x86/av1_fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
- // This 2D transform implements 4 vertical 1D transforms followed
- // by 4 horizontal 1D transforms. The multiplies and adds are as given
- // by Chen, Smith and Fralick ('77). The commands for moving the data
- // around have been minimized by hand.
- // For the purposes of the comments, the 16 inputs are referred to as i0
- // through iF (in raster order), intermediate variables are a0, b0, c0
- // through f, and correspond to the in-place computations mapped to input
- // locations. The outputs, o0 through oF are labeled according to the
- // output locations.
-
- // Constants
- // These are the coefficients used for the multiplies.
- // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
- // where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 =
- _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
- __m128i cmp0, cmp1;
- int test, overflow;
-#endif
-
- // Load inputs.
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-// in0 = [i0 i1 i2 i3 iC iD iE iF]
-// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-#if DCT_HIGH_BIT_DEPTH
- // Check inputs small enough to use optimised code
- cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
- cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
- test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
- if (test) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // multiply by 16 to give some extra precision
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- // add 1 to the upper left pixel if it is non-zero, which helps reduce
- // the round-trip error
- {
- // The mask will only contain whether the first value is zero, all
- // other comparisons will fail, as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- // There are 4 total stages, alternating between an add/subtract stage
- // and a multiply-and-add stage.
- {
- // Stage 1: Add/subtract
-
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
- const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
- // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
- // r1 = [iC i8 iD i9 iE iA iF iB]
- const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
- const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
- // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
- // r3 = [iC i8 iD i9 iF iB iE iA]
-
- const __m128i t0 = _mm_add_epi16(r2, r3);
- const __m128i t1 = _mm_sub_epi16(r2, r3);
- // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
- // t1 = [aC a8 aD a9 aF aB aE aA]
-
- // Stage 2: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
- // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
- // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
- // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
- // Then add and right-shift to get back to 16-bit range
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // w0 = [b0 b1 b7 b6]
- // w1 = [b8 b9 bF bE]
- // w2 = [b4 b5 b3 b2]
- // w3 = [bC bD bB bA]
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
- // x1 = [b4 b5 b3 b2 bC bD bB bA]
- in0 = _mm_shuffle_epi32(x0, 0xD8);
- in1 = _mm_shuffle_epi32(x1, 0x8D);
- // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
- // in1 = [b3 b2 bB bA b4 b5 bC bD]
- }
- {
- // vertical DCTs finished. Now we do the horizontal DCTs.
- // Stage 3: Add/subtract
-
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
- const __m128i t0 = ADD_EPI16(in0, in1);
- const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&t0, &t1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // Stage 4: multiply by constants (which gets us into 32 bits).
- {
- // The constants needed here are:
- // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
- // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
- // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
- // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
- const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
- const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
- // Then add and right-shift to get back to 16-bit range
- // but this combines the final right-shift as well to save operations
- // This unusual rounding operation is to maintain bit-accurate
- // compatibility with the C version of this function, which has two
- // rounding steps in a row.
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
- // w0 = [o0 o4 o8 oC]
- // w1 = [o2 o6 oA oE]
- // w2 = [o1 o5 o9 oD]
- // w3 = [o3 o7 oB oF]
- // remember the o's are numbered according to the correct output location
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // x0 = [o0 o4 o8 oC o2 o6 oA oE]
- // x1 = [o1 o5 o9 oD o3 o7 oB oF]
- const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
- const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
- // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
- // y1 = [o2 o3 o6 o7 oA oB oE oF]
- in0 = _mm_unpacklo_epi32(y0, y1);
- // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
- in1 = _mm_unpackhi_epi32(y0, y1);
- // in1 = [o8 o9 oA oB oC oD oE oF]
- }
- }
- }
-  // Post-condition (v + 1) >> 2 is now incorporated into the previous
-  // add and right-shift commands. Only 2 store instructions are needed
-  // because output rows 1/3 are stored immediately after rows 0/2.
- storeu_output(&in0, output + 0 * 4);
- storeu_output(&in1, output + 2 * 4);
-}
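A minimal scalar sketch of the folded rounding used above, assuming DCT_CONST_BITS == 14 (so DCT_CONST_ROUNDING == 1 << 13), combined constants of 16 and 3 << 13, and arithmetic right shifts on negative values:

#include <assert.h>
#include <stdint.h>

/* Two-step rounding as in the C transform: fdct_round_shift, then the
 * post-condition (x + 1) >> 2. */
static int32_t two_step_round(int32_t a) {
  const int32_t x = (a + (1 << 13)) >> 14;
  return (x + 1) >> 2;
}

/* Single add-and-shift, as done above with DCT_CONST_ROUNDING2 and
 * DCT_CONST_BITS2 (assumed here to be 3 << 13 and 16). */
static int32_t folded_round(int32_t a) { return (a + (3 << 13)) >> 16; }

int main(void) {
  int32_t a;
  for (a = -(1 << 20); a <= (1 << 20); ++a)
    assert(two_step_round(a) == folded_round(a));
  return 0;
}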
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
- int pass;
- // Constants
-  // In one case they are all the same; in all the others we need a pair of
-  // constants repeated four times. This is done by constructing the 32-bit
-  // constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = ADD_EPI16(in0, in7);
- const __m128i q1 = ADD_EPI16(in1, in6);
- const __m128i q2 = ADD_EPI16(in2, in5);
- const __m128i q3 = ADD_EPI16(in3, in4);
- const __m128i q4 = SUB_EPI16(in3, in4);
- const __m128i q5 = SUB_EPI16(in2, in5);
- const __m128i q6 = SUB_EPI16(in1, in6);
- const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
- if (pass == 1) {
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-      // Interleave to do the multiply by constants which gets us into 32 bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
-      // Interleave to do the multiply by constants which gets us into 32 bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-        // Interleave to do the multiply by constants which gets us into 32 bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
-      // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 25 35
-      // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
-    // Division of a 16-bit signed number by two using shifts:
- // n / 2 = (n - (n >> 15)) >> 1
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- store_output(&in0, (output + 0 * 8));
- store_output(&in1, (output + 1 * 8));
- store_output(&in2, (output + 2 * 8));
- store_output(&in3, (output + 3 * 8));
- store_output(&in4, (output + 4 * 8));
- store_output(&in5, (output + 5 * 8));
- store_output(&in6, (output + 6 * 8));
- store_output(&in7, (output + 7 * 8));
- }
-}
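The identity n / 2 = (n - (n >> 15)) >> 1 used in the post-condition above divides a signed 16-bit value by two, rounding toward zero like C integer division; a minimal check, assuming arithmetic right shifts:

#include <assert.h>
#include <stdint.h>

/* Mirrors the SIMD sequence: subtract the sign bit, then shift right once. */
static int16_t shift_div2(int16_t n) {
  return (int16_t)((n - (n >> 15)) >> 1);
}

int main(void) {
  int32_t n;
  for (n = INT16_MIN; n <= INT16_MAX; ++n)
    assert(shift_div2((int16_t)n) == (int16_t)(n / 2));
  return 0;
}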
-
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. Since the
-  // first-pass results are transposed, the second pass again operates on
-  // columns (the transposed rows) and transposes its results, so that
-  // they end up back in normal/row positions.
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[256]);
- const int16_t *in = input;
- int16_t *out0 = intermediate;
- tran_low_t *out1 = output;
- // Constants
-  // In one case they are all the same; in all the others we need a pair of
-  // constants repeated four times. This is done by constructing the 32-bit
-  // constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = ADD_EPI16(in00, in15);
- input1 = ADD_EPI16(in01, in14);
- input2 = ADD_EPI16(in02, in13);
- input3 = ADD_EPI16(in03, in12);
- input4 = ADD_EPI16(in04, in11);
- input5 = ADD_EPI16(in05, in10);
- input6 = ADD_EPI16(in06, in09);
- input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
- &input4, &input5, &input6, &input7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = SUB_EPI16(in07, in08);
- step1_1 = SUB_EPI16(in06, in09);
- step1_2 = SUB_EPI16(in05, in10);
- step1_3 = SUB_EPI16(in04, in11);
- step1_4 = SUB_EPI16(in03, in12);
- step1_5 = SUB_EPI16(in02, in13);
- step1_6 = SUB_EPI16(in01, in14);
- step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = ADD_EPI16(input0, input7);
- const __m128i q1 = ADD_EPI16(input1, input6);
- const __m128i q2 = ADD_EPI16(input2, input5);
- const __m128i q3 = ADD_EPI16(input3, input4);
- const __m128i q4 = SUB_EPI16(input3, input4);
- const __m128i q5 = SUB_EPI16(input2, input5);
- const __m128i q6 = SUB_EPI16(input1, input6);
- const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i r0 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- const __m128i r1 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 3
- {
- step3_0 = ADD_EPI16(step1_0, step2_3);
- step3_1 = ADD_EPI16(step1_1, step2_2);
- step3_2 = SUB_EPI16(step1_1, step2_2);
- step3_3 = SUB_EPI16(step1_0, step2_3);
- step3_4 = SUB_EPI16(step1_7, step2_4);
- step3_5 = SUB_EPI16(step1_6, step2_5);
- step3_6 = ADD_EPI16(step1_6, step2_5);
- step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
- &step3_4, &step3_5, &step3_6, &step3_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 5
- {
- step1_0 = ADD_EPI16(step3_0, step2_1);
- step1_1 = SUB_EPI16(step3_0, step2_1);
- step1_2 = ADD_EPI16(step3_3, step2_2);
- step1_3 = SUB_EPI16(step3_3, step2_2);
- step1_4 = SUB_EPI16(step3_4, step2_5);
- step1_5 = ADD_EPI16(step3_4, step2_5);
- step1_6 = SUB_EPI16(step3_7, step2_6);
- step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
- &res06, &res07, pass, out0, out1);
- transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
- &res14, &res15, pass, out0 + 8, out1 + 8);
- if (pass == 0) {
- out0 += 8 * 16;
- } else {
- out1 += 8 * 16;
- }
- }
- // Setup in/out for next pass.
- in = intermediate;
- }
-}
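A scalar sketch of the two-pass structure described at the top of FDCT16x16_2D; fdct1d_stub is a hypothetical stand-in for the real 16-point transform and only illustrates the transform/transpose data flow:

#include <string.h>

enum { N = 16 };

/* Hypothetical 1-D stage; the real code applies the 16-point DCT here. */
static void fdct1d_stub(const short *in, short *out) {
  int i;
  for (i = 0; i < N; ++i) out[i] = in[i];
}

/* Each pass transforms the columns of its input and stores the results
 * transposed, so the second pass works on the original rows and leaves the
 * final output back in row order. */
static void fdct2d_sketch(short in[N][N], short out[N][N]) {
  short tmp[N][N], col[N], res[N];
  int pass, r, c;
  memcpy(tmp, in, sizeof(tmp));
  for (pass = 0; pass < 2; ++pass) {
    for (c = 0; c < N; ++c) {
      for (r = 0; r < N; ++r) col[r] = tmp[r][c];
      fdct1d_stub(col, res);
      for (r = 0; r < N; ++r) out[c][r] = res[r];
    }
    memcpy(tmp, out, sizeof(tmp));
  }
}

int main(void) {
  short a[N][N] = { { 0 } }, b[N][N];
  fdct2d_sketch(a, b);
  return 0;
}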
-
-#undef ADD_EPI16
-#undef SUB_EPI16
diff --git a/av1/common/x86/av1_fwd_txfm_sse2.c b/av1/common/x86/av1_fwd_txfm_sse2.c
deleted file mode 100644
index 081fe08..0000000
--- a/av1/common/x86/av1_fwd_txfm_sse2.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_config.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0, in1;
- __m128i tmp;
- const __m128i zero = _mm_setzero_si128();
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
- tmp = _mm_add_epi16(in0, in1);
- in0 = _mm_unpacklo_epi16(zero, tmp);
- in1 = _mm_unpackhi_epi16(zero, tmp);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(tmp, zero);
- in1 = _mm_unpackhi_epi32(tmp, zero);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(tmp, 8);
-
- in1 = _mm_add_epi32(tmp, in0);
- in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
-}
-
-void av1_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i u0, u1, sum;
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(u0, u1);
-
- in0 = _mm_add_epi16(in0, in1);
- in2 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, in0);
-
- u0 = _mm_setzero_si128();
- sum = _mm_add_epi16(sum, in2);
-
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
-}
-
-void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 2; ++i) {
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
-}
-
-void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 8; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D av1_fdct4x4_sse2
-#define FDCT8x8_2D av1_fdct8x8_sse2
-#define FDCT16x16_2D av1_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_AOM_HIGHBITDEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D av1_highbd_fdct4x4_sse2
-#define FDCT8x8_2D av1_highbd_fdct8x8_sse2
-#define FDCT16x16_2D av1_highbd_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h" // NOLINT
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
deleted file mode 100644
index 365c124..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ /dev/null
@@ -1,4028 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/av1_inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
- }
-
-void av1_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16(
- (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i input0, input1, input2, input3;
-
- // Rows
- input0 = _mm_load_si128((const __m128i *)input);
- input2 = _mm_load_si128((const __m128i *)(input + 8));
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input0 = _mm_shufflehi_epi16(input0, 0xd8);
- input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
- input1 = _mm_unpackhi_epi32(input0, input0);
- input0 = _mm_unpacklo_epi32(input0, input0);
- input3 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpacklo_epi32(input2, input2);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input1);
- input1 = _mm_packs_epi32(input2, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch columns 2 and 3, and then we get:
-  // input2: column 1, column 0; input3: column 2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Columns
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_unpacklo_epi32(input2, input2);
- input1 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpackhi_epi32(input3, input3);
- input3 = _mm_unpacklo_epi32(input3, input3);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input2);
- input1 = _mm_packs_epi32(input1, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Switch columns 2 and 3, and then we get:
-  // input2: column 1, column 0; input3: column 2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Final round and shift
- input2 = _mm_add_epi16(input2, eight);
- input3 = _mm_add_epi16(input3, eight);
-
- input2 = _mm_srai_epi16(input2, 4);
- input3 = _mm_srai_epi16(input3, 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void av1_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 4);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-void av1_idct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
-
- transpose_4x4(in);
- // stage 1
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
-
- // stage 2
- in[0] = _mm_add_epi16(u[0], u[1]);
- in[1] = _mm_sub_epi16(u[0], u[1]);
- in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
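Per 4-point vector, the two stages of av1_idct4_sse2 above compute the standard 4-point IDCT butterflies. A scalar sketch of that arithmetic follows; the SK_* names and values are illustrative stand-ins for the library's cospi_* constants and dct_const_round_shift, not its API:

#include <stdint.h>

#define SK_DCT_CONST_BITS 14
#define SK_COSPI_8_64 15137  /* ~cos(pi/8) * 2^14 */
#define SK_COSPI_16_64 11585 /* ~cos(pi/4) * 2^14 */
#define SK_COSPI_24_64 6270  /* ~cos(3*pi/8) * 2^14 */

static int32_t sk_round_shift(int32_t x) {
  return (x + (1 << (SK_DCT_CONST_BITS - 1))) >> SK_DCT_CONST_BITS;
}

static void idct4_sketch(const int16_t in[4], int16_t out[4]) {
  /* stage 1: constant multiplies on the (0,2) and (1,3) pairs */
  const int32_t s0 = sk_round_shift((in[0] + in[2]) * SK_COSPI_16_64);
  const int32_t s1 = sk_round_shift((in[0] - in[2]) * SK_COSPI_16_64);
  const int32_t s2 = sk_round_shift(in[1] * SK_COSPI_24_64 - in[3] * SK_COSPI_8_64);
  const int32_t s3 = sk_round_shift(in[1] * SK_COSPI_8_64 + in[3] * SK_COSPI_24_64);
  /* stage 2: add/subtract to form the outputs */
  out[0] = (int16_t)(s0 + s3);
  out[1] = (int16_t)(s1 + s2);
  out[2] = (int16_t)(s1 - s2);
  out[3] = (int16_t)(s0 - s3);
}

int main(void) {
  int16_t x[4] = { 64, 0, 0, 0 }, y[4];
  idct4_sketch(x, y);
  return 0;
}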
-
-void av1_iadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
- const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8], in7;
-
- transpose_4x4(in);
- in7 = _mm_srli_si128(in[1], 8);
- in7 = _mm_add_epi16(in7, in[0]);
- in7 = _mm_sub_epi16(in7, in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
- v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_add_epi32(v[3], v[4]);
- u[2] = v[2];
- u[3] = _mm_add_epi32(u[0], u[1]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_add_epi32(u[3], v[5]);
- u[6] = _mm_sub_epi32(u[5], u[4]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
-// Define a macro that multiplies elements by constants and adds them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
- res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- }
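Per 32-bit lane, the madd/round/shift/pack sequence in the two macros above amounts to a constant-multiply butterfly followed by rounding and a saturating pack. A minimal scalar sketch of one lane, assuming DCT_CONST_BITS == 14:

#include <stdint.h>

/* a and b are the two interleaved 16-bit inputs in one lane; c0 and c1 are
 * the paired cosine constants. _mm_madd_epi16 forms a*c0 + b*c1, the add and
 * arithmetic shift perform the rounding, and _mm_packs_epi32 saturates the
 * result back to 16 bits. */
static int16_t butterfly_lane(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  int32_t v = (int32_t)a * c0 + (int32_t)b * c1;
  v = (v + (1 << 13)) >> 14;
  if (v > INT16_MAX) v = INT16_MAX;
  if (v < INT16_MIN) v = INT16_MIN;
  return (int16_t)v;
}

int main(void) {
  (void)butterfly_lane(100, -50, 11585, 11585);
  return 0;
}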
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
- out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
- stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
- }
-
-void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
- // 2-D
- for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
- in6, in7);
- }
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
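-// DC-only 8x8 path: the single DC coefficient is scaled by cospi_16_64
-// once per dimension, rounded, and the resulting constant is added to
-// every pixel of the 8x8 destination block.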
-void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void av1_idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
- in1, in2, in3, in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
- in[4], in[5], in[6], in[7]);
-}
-
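-// 8-point ADST on eight packed rows: transpose, reorder the inputs for the
-// butterfly, then three stages of cosine-pair rotations; the odd-indexed
-// outputs are negated at the end.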
-void av1_iadst8_sse2(__m128i *in) {
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // transpose
- array_transpose_8x8(in, in);
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit integers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
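-// Reduced 8x8 path for sparse blocks: only the first four coefficient rows
-// are loaded, so the row pass works on the low-frequency 4x4 corner before
-// the full IDCT8 column pass, final rounding, and reconstruction.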
-void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // Rows. Load 4-row input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
- }
-
- // Stage2
- {
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage3
- {
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
- in5, in6, in7);
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
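-// IDCT16: stages 2-6 of the 16-point inverse DCT over in[0..15]; the final
-// stage-7 add/sub pairs are written out by the callers below.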
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
- stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
- stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
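-// IDCT16_10: the same butterfly specialized for inputs where only in[0..3]
-// carry nonzero coefficients, so several stages collapse into copies.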
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
- stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
- stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
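-// Full 16x16 inverse DCT: two row passes store the intermediate halves in
-// l[] and r[], then two column passes finish with rounding by 1 << 5, a
-// shift right by 6, and reconstruction into dest.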
-void av1_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- curr1 = l;
- for (i = 0; i < 2; i++) {
- // 1-D av1_idct
-
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
- }
- for (i = 0; i < 2; i++) {
- int j;
- // 1-D av1_idct
- array_transpose_8x8(l + i * 8, in);
- array_transpose_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
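-// DC-only 16x16 path, analogous to the 8x8 version above but with a final
-// shift of 6 and the block reconstructed in two 8-column halves.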
-void av1_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
- RECON_AND_STORE(dest + 8 * stride, dc_value);
- RECON_AND_STORE(dest + 9 * stride, dc_value);
- RECON_AND_STORE(dest + 10 * stride, dc_value);
- RECON_AND_STORE(dest + 11 * stride, dc_value);
- RECON_AND_STORE(dest + 12 * stride, dc_value);
- RECON_AND_STORE(dest + 13 * stride, dc_value);
- RECON_AND_STORE(dest + 14 * stride, dc_value);
- RECON_AND_STORE(dest + 15 * stride, dc_value);
- dest += 8;
- }
-}
-
-static void av1_iadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
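-// 16-point 1-D inverse DCT on eight packed columns, written out stage by
-// stage instead of using the IDCT16 macro above.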
-static void av1_idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_unpacklo_epi16(s[5], s[6]);
- u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
-void av1_idct16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_idct16_8col(in0);
- av1_idct16_8col(in1);
-}
-
-void av1_iadst16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_iadst16_8col(in0);
- av1_iadst16_8col(in1);
-}
-
-void av1_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
- stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
- stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
- stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- int j;
- array_transpose_4X8(l + 8 * i, in);
-
- IDCT16_10
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = _mm_load_si128((const __m128i *)input); \
- input += 8; \
- }
-
-#define IDCT32_34 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
- stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
- stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
- stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
- stp1_24); \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
- stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
- stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
- stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
- stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-#define IDCT32 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
- stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
- stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-// Only upper-left 8x8 has non-zero coeff
-void av1_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
- // av1_idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[32];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data. Only need to load the top left 8x8 block.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 32));
- in[2] = _mm_load_si128((const __m128i *)(input + 64));
- in[3] = _mm_load_si128((const __m128i *)(input + 96));
- in[4] = _mm_load_si128((const __m128i *)(input + 128));
- in[5] = _mm_load_si128((const __m128i *)(input + 160));
- in[6] = _mm_load_si128((const __m128i *)(input + 192));
- in[7] = _mm_load_si128((const __m128i *)(input + 224));
-
- for (i = 8; i < 32; ++i) {
- in[i] = _mm_setzero_si128();
- }
-
- array_transpose_8x8(in, in);
- // TODO(hkuang): The following transposes are unnecessary, but removing them
- // will lead to a performance drop on some devices.
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32_34
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[0] = _mm_add_epi16(stp1_0, stp1_31);
- col[1] = _mm_add_epi16(stp1_1, stp1_30);
- col[2] = _mm_add_epi16(stp1_2, stp1_29);
- col[3] = _mm_add_epi16(stp1_3, stp1_28);
- col[4] = _mm_add_epi16(stp1_4, stp1_27);
- col[5] = _mm_add_epi16(stp1_5, stp1_26);
- col[6] = _mm_add_epi16(stp1_6, stp1_25);
- col[7] = _mm_add_epi16(stp1_7, stp1_24);
- col[8] = _mm_add_epi16(stp1_8, stp1_23);
- col[9] = _mm_add_epi16(stp1_9, stp1_22);
- col[10] = _mm_add_epi16(stp1_10, stp1_21);
- col[11] = _mm_add_epi16(stp1_11, stp1_20);
- col[12] = _mm_add_epi16(stp1_12, stp1_19);
- col[13] = _mm_add_epi16(stp1_13, stp1_18);
- col[14] = _mm_add_epi16(stp1_14, stp1_17);
- col[15] = _mm_add_epi16(stp1_15, stp1_16);
- col[16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[31] = _mm_sub_epi16(stp1_0, stp1_31);
- for (i = 0; i < 4; i++) {
- int j;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- // av1_idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D av1_idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- }
- for (i = 0; i < 4; i++) {
- // Second 1-D av1_idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 4; ++i) {
- int j;
- for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + j * stride, dc_value);
- }
- dest += 8;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
-}
-
-void av1_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- av1_idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
- }
-
- if (optimised_cols) {
- av1_idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // Only the first 4 rows have non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use the fact that only the first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // Since all non-zero dct coefficients are in the upper-left 4x4 area,
- // we only need to consider the first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use the fact that only the first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
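The deleted av1_highbd_idct*_add_sse2 paths above all hinge on the same guard: pack the 32-bit coefficients down to 16 bits, compare every value against a per-transform bound (12043, 6201 or 3155 above), and only run the fast 16-bit SSE2 transform when nothing can overflow, falling back to the av1_highbd_idct*_c transforms otherwise. A minimal sketch of that guard in isolation follows; coeffs_fit_16bit_range is a hypothetical name, and the coefficient count is assumed to be a multiple of eight.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Returns 1 when every coefficient packs into [-bound, bound] and the
 * 16-bit SSE2 transform can be used; 0 means take the 32-bit C fallback. */
static int coeffs_fit_16bit_range(const int32_t *coeff, int count,
                                  int16_t bound) {
  const __m128i max = _mm_set1_epi16(bound);
  const __m128i min = _mm_set1_epi16((int16_t)-bound);
  __m128i flags = _mm_setzero_si128();
  int i;
  for (i = 0; i < count; i += 8) {
    /* Pack eight 32-bit coefficients into eight 16-bit lanes (saturating). */
    const __m128i lo = _mm_loadu_si128((const __m128i *)(coeff + i));
    const __m128i hi = _mm_loadu_si128((const __m128i *)(coeff + i + 4));
    const __m128i v = _mm_packs_epi32(lo, hi);
    /* Flag any lane that falls outside [-bound, bound]. */
    flags = _mm_or_si128(flags, _mm_cmpgt_epi16(v, max));
    flags = _mm_or_si128(flags, _mm_cmplt_epi16(v, min));
  }
  return _mm_movemask_epi8(flags) == 0;
}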
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h
deleted file mode 100644
index 3aab34c..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
-}
-
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
- }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
- RECON_AND_STORE(dest + 10 * stride, in[10]);
- RECON_AND_STORE(dest + 11 * stride, in[11]);
- RECON_AND_STORE(dest + 12 * stride, in[12]);
- RECON_AND_STORE(dest + 13 * stride, in[13]);
- RECON_AND_STORE(dest + 14 * stride, in[14]);
- RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
-
-#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
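The deleted header's RECON_AND_STORE macro and write_buffer_8x16 implement the standard tail of the inverse transform: round the residual with +32, arithmetic-shift right by 6, add it to the 8-bit prediction, and clip with unsigned saturation via _mm_packus_epi16. Ignoring the 16-bit saturation that _mm_adds_epi16 provides, the same per-pixel arithmetic in scalar C looks like the sketch below; round_shift_recon_row_c is a hypothetical name used only for illustration.

#include <stdint.h>

/* One 8-pixel row: round/shift the residual, add the prediction, clip. */
static void round_shift_recon_row_c(uint8_t *dest, const int16_t *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    const int res = (in[i] + 32) >> 6;                      /* final rounding */
    const int v = dest[i] + res;                            /* reconstruct */
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   /* packus clip */
  }
}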
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 0000000..754152c
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+#else
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+ int i = 0;
+ while (i < 16) {
+ load_coeff(coeff + (i << 4), &in[i]);
+ i += 1;
+ }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = _mm_loadu_si128((__m128i const *)output);
+ __m128i p0 = _mm_unpacklo_epi8(x, zero);
+ __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+ p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+ p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+ x = _mm_packus_epi16(p0, p1);
+ _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+ const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+ recon_and_store(&in[i], output + i * stride);
+ i += 1;
+ }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b0, __m256i *b1) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ *b0 = butter_fly(x0, x1, *c0);
+ *b1 = butter_fly(x0, x1, *c1);
+}
+
+static void idct16_avx2(__m256i *in) {
+ const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+ const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+ const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+ const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ // stage 1, (0-7)
+ u0 = in[0];
+ u1 = in[8];
+ u2 = in[4];
+ u3 = in[12];
+ u4 = in[2];
+ u5 = in[10];
+ u6 = in[6];
+ u7 = in[14];
+
+ // stage 2, (0-7)
+ // stage 3, (0-7)
+ t0 = u0;
+ t1 = u1;
+ t2 = u2;
+ t3 = u3;
+ unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+ unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+ // stage 4, (0-7)
+ unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+ unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+ u4 = _mm256_add_epi16(t4, t5);
+ u5 = _mm256_sub_epi16(t4, t5);
+ u6 = _mm256_sub_epi16(t7, t6);
+ u7 = _mm256_add_epi16(t7, t6);
+
+ // stage 5, (0-7)
+ t0 = _mm256_add_epi16(u0, u3);
+ t1 = _mm256_add_epi16(u1, u2);
+ t2 = _mm256_sub_epi16(u1, u2);
+ t3 = _mm256_sub_epi16(u0, u3);
+ t4 = u4;
+ t7 = u7;
+ unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+ // stage 6, (0-7)
+ u0 = _mm256_add_epi16(t0, t7);
+ u1 = _mm256_add_epi16(t1, t6);
+ u2 = _mm256_add_epi16(t2, t5);
+ u3 = _mm256_add_epi16(t3, t4);
+ u4 = _mm256_sub_epi16(t3, t4);
+ u5 = _mm256_sub_epi16(t2, t5);
+ u6 = _mm256_sub_epi16(t1, t6);
+ u7 = _mm256_sub_epi16(t0, t7);
+
+ // stage 1, (8-15)
+ v0 = in[1];
+ v1 = in[9];
+ v2 = in[5];
+ v3 = in[13];
+ v4 = in[3];
+ v5 = in[11];
+ v6 = in[7];
+ v7 = in[15];
+
+ // stage 2, (8-15)
+ unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+ unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+ unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+ // stage 3, (8-15)
+ v0 = _mm256_add_epi16(t0, t1);
+ v1 = _mm256_sub_epi16(t0, t1);
+ v2 = _mm256_sub_epi16(t3, t2);
+ v3 = _mm256_add_epi16(t2, t3);
+ v4 = _mm256_add_epi16(t4, t5);
+ v5 = _mm256_sub_epi16(t4, t5);
+ v6 = _mm256_sub_epi16(t7, t6);
+ v7 = _mm256_add_epi16(t6, t7);
+
+ // stage 4, (8-15)
+ t0 = v0;
+ t7 = v7;
+ t3 = v3;
+ t4 = v4;
+ unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+ // stage 5, (8-15)
+ v0 = _mm256_add_epi16(t0, t3);
+ v1 = _mm256_add_epi16(t1, t2);
+ v2 = _mm256_sub_epi16(t1, t2);
+ v3 = _mm256_sub_epi16(t0, t3);
+ v4 = _mm256_sub_epi16(t7, t4);
+ v5 = _mm256_sub_epi16(t6, t5);
+ v6 = _mm256_add_epi16(t6, t5);
+ v7 = _mm256_add_epi16(t7, t4);
+
+ // stage 6, (8-15)
+ t0 = v0;
+ t1 = v1;
+ t6 = v6;
+ t7 = v7;
+ unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+ unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+ // stage 7
+ in[0] = _mm256_add_epi16(u0, t7);
+ in[1] = _mm256_add_epi16(u1, t6);
+ in[2] = _mm256_add_epi16(u2, t5);
+ in[3] = _mm256_add_epi16(u3, t4);
+ in[4] = _mm256_add_epi16(u4, t3);
+ in[5] = _mm256_add_epi16(u5, t2);
+ in[6] = _mm256_add_epi16(u6, t1);
+ in[7] = _mm256_add_epi16(u7, t0);
+ in[8] = _mm256_sub_epi16(u7, t0);
+ in[9] = _mm256_sub_epi16(u6, t1);
+ in[10] = _mm256_sub_epi16(u5, t2);
+ in[11] = _mm256_sub_epi16(u4, t3);
+ in[12] = _mm256_sub_epi16(u3, t4);
+ in[13] = _mm256_sub_epi16(u2, t5);
+ in[14] = _mm256_sub_epi16(u1, t6);
+ in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ idct16_avx2(in);
+}
+
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ b[0] = _mm256_madd_epi16(x0, *c0);
+ b[1] = _mm256_madd_epi16(x1, *c0);
+ b[2] = _mm256_madd_epi16(x0, *c1);
+ b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ int i;
+ for (i = 0; i < num; ++i) {
+ a[i] = _mm256_add_epi32(a[i], dct_rounding);
+ a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+ }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_add_epi32(a[0], b[0]);
+ x[1] = _mm256_add_epi32(a[1], b[1]);
+ x[2] = _mm256_add_epi32(a[2], b[2]);
+ x[3] = _mm256_add_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_sub_epi32(a[0], b[0]);
+ x[1] = _mm256_sub_epi32(a[1], b[1]);
+ x[2] = _mm256_sub_epi32(a[2], b[2]);
+ x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+ group_rounding(a, 4);
+ out[0] = _mm256_packs_epi32(a[0], a[1]);
+ out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
+
+static void iadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i x[16], s[16];
+ __m256i u[4], v[4];
+
+ // stage 1
+ butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+ butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+ add_rnd(u, v, &x[0]);
+ sub_rnd(u, v, &x[8]);
+
+ butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+ butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+ add_rnd(u, v, &x[2]);
+ sub_rnd(u, v, &x[10]);
+
+ butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+ butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[12]);
+
+ butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+ butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+ add_rnd(u, v, &x[6]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 2
+ s[0] = _mm256_add_epi16(x[0], x[4]);
+ s[1] = _mm256_add_epi16(x[1], x[5]);
+ s[2] = _mm256_add_epi16(x[2], x[6]);
+ s[3] = _mm256_add_epi16(x[3], x[7]);
+ s[4] = _mm256_sub_epi16(x[0], x[4]);
+ s[5] = _mm256_sub_epi16(x[1], x[5]);
+ s[6] = _mm256_sub_epi16(x[2], x[6]);
+ s[7] = _mm256_sub_epi16(x[3], x[7]);
+ butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+ butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+ add_rnd(u, v, &s[8]);
+ sub_rnd(u, v, &s[12]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+ add_rnd(u, v, &s[10]);
+ sub_rnd(u, v, &s[14]);
+
+ // stage 3
+ x[0] = _mm256_add_epi16(s[0], s[2]);
+ x[1] = _mm256_add_epi16(s[1], s[3]);
+ x[2] = _mm256_sub_epi16(s[0], s[2]);
+ x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+ x[8] = _mm256_add_epi16(s[8], s[10]);
+ x[9] = _mm256_add_epi16(s[9], s[11]);
+ x[10] = _mm256_sub_epi16(s[8], s[10]);
+ x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+ butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[6]);
+
+ butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[12]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 4
+ butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+ butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+ butterfly_rnd(u, &x[2]);
+ butterfly_rnd(v, &x[6]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+ butterfly_rnd(u, &x[10]);
+ butterfly_rnd(v, &x[14]);
+
+ in[0] = x[0];
+ in[1] = _mm256_sub_epi16(zero, x[8]);
+ in[2] = x[12];
+ in[3] = _mm256_sub_epi16(zero, x[4]);
+ in[4] = x[6];
+ in[5] = x[14];
+ in[6] = x[10];
+ in[7] = x[2];
+ in[8] = x[3];
+ in[9] = x[11];
+ in[10] = x[15];
+ in[11] = x[7];
+ in[12] = x[5];
+ in[13] = _mm256_sub_epi16(zero, x[13]);
+ in[14] = x[9];
+ in[15] = _mm256_sub_epi16(zero, x[1]);
+}
+
+static void iadst16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ mm256_reverse_epi16(&in[i]);
+ }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+ *dest = *dest + (rows - 1) * (*stride);
+ *stride = -*stride;
+}
+
+static void iidtx16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m256i in[16];
+
+ load_buffer_16x16(input, in);
+ switch (tx_type) {
+ case DCT_DCT:
+ idct16(in);
+ idct16(in);
+ break;
+ case ADST_DCT:
+ idct16(in);
+ iadst16(in);
+ break;
+ case DCT_ADST:
+ iadst16(in);
+ idct16(in);
+ break;
+ case ADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ idct16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case DCT_FLIPADST:
+ iadst16(in);
+ idct16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ flip_col(&dest, &stride, 16);
+ break;
+ case ADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case V_DCT:
+ iidtx16(in);
+ idct16(in);
+ break;
+ case H_DCT:
+ idct16(in);
+ iidtx16(in);
+ break;
+ case V_ADST:
+ iidtx16(in);
+ iadst16(in);
+ break;
+ case H_ADST:
+ iadst16(in);
+ iidtx16(in);
+ break;
+ case V_FLIPADST:
+ iidtx16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case H_FLIPADST:
+ iadst16(in);
+ iidtx16(in);
+ flip_row(in, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x16(in, stride, dest);
+}
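Each 1-D helper used in the switch above (idct16, iadst16, iidtx16) first transposes the 16x16 block and then transforms what are now its rows, so calling two of them back to back applies the transform along one dimension and then the other; the FLIPADST cases only add a row reversal (flip_row) or a bottom-up write with a negated stride (flip_col) on top of that. A scalar sketch of the two-stage composition follows, with hypothetical names (inv_txfm2d_16x16_c, transform1d) and int32_t working precision assumed.

#include <stdint.h>

typedef void (*transform1d)(int32_t *row);  /* 16 values, in place */

static void transpose_16x16_c(int32_t b[16][16]) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = r + 1; c < 16; ++c) {
      const int32_t t = b[r][c];
      b[r][c] = b[c][r];
      b[c][r] = t;
    }
  }
}

static void inv_txfm2d_16x16_c(int32_t b[16][16], transform1d first_tx,
                               transform1d second_tx) {
  int i;
  /* Stage 1: transpose, then transform rows (i.e. the original columns). */
  transpose_16x16_c(b);
  for (i = 0; i < 16; ++i) first_tx(b[i]);
  /* Stage 2: transpose back, then transform the remaining dimension. */
  transpose_16x16_c(b);
  for (i = 0; i < 16; ++i) second_tx(b[i]);
}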
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 27cd756..a6b6e1e 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -69,46 +69,46 @@
switch (tx_type) {
case DCT_DCT:
- idct4_sse2(in);
- idct4_sse2(in);
+ aom_idct4_sse2(in);
+ aom_idct4_sse2(in);
break;
case ADST_DCT:
- idct4_sse2(in);
- iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
break;
case DCT_ADST:
- iadst4_sse2(in);
- idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
break;
case ADST_ADST:
- iadst4_sse2(in);
- iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
- idct4_sse2(in);
- iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
case DCT_FLIPADST:
- iadst4_sse2(in);
- idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_FLIPADST:
- iadst4_sse2(in);
- iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
fliplr_4x4(in);
break;
case ADST_FLIPADST:
- iadst4_sse2(in);
- iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
fliplr_4x4(in);
break;
case FLIPADST_ADST:
- iadst4_sse2(in);
- iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
FLIPUD_PTR(dest, stride, 4);
break;
#endif // CONFIG_EXT_TX
@@ -167,46 +167,46 @@
switch (tx_type) {
case DCT_DCT:
- idct8_sse2(in);
- idct8_sse2(in);
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in);
break;
case ADST_DCT:
- idct8_sse2(in);
- iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
break;
case DCT_ADST:
- iadst8_sse2(in);
- idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
break;
case ADST_ADST:
- iadst8_sse2(in);
- iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
- idct8_sse2(in);
- iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
case DCT_FLIPADST:
- iadst8_sse2(in);
- idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_FLIPADST:
- iadst8_sse2(in);
- iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
fliplr_8x8(in);
break;
case ADST_FLIPADST:
- iadst8_sse2(in);
- iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
fliplr_8x8(in);
break;
case FLIPADST_ADST:
- iadst8_sse2(in);
- iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
FLIPUD_PTR(dest, stride, 8);
break;
#endif // CONFIG_EXT_TX
@@ -242,69 +242,6 @@
RECON_AND_STORE(dest + 7 * stride, in[7]);
}
-void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride, int tx_type) {
- __m128i in[32];
- __m128i *in0 = &in[0];
- __m128i *in1 = &in[16];
-
- load_buffer_8x16(input, in0);
- input += 8;
- load_buffer_8x16(input, in1);
-
- switch (tx_type) {
- case DCT_DCT:
- idct16_sse2(in0, in1);
- idct16_sse2(in0, in1);
- break;
- case ADST_DCT:
- idct16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- break;
- case DCT_ADST:
- iadst16_sse2(in0, in1);
- idct16_sse2(in0, in1);
- break;
- case ADST_ADST:
- iadst16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- break;
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- idct16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
- case DCT_FLIPADST:
- iadst16_sse2(in0, in1);
- idct16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_FLIPADST:
- iadst16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- FLIPLR_16x16(in0, in1);
- break;
- case ADST_FLIPADST:
- iadst16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- FLIPLR_16x16(in0, in1);
- break;
- case FLIPADST_ADST:
- iadst16_sse2(in0, in1);
- iadst16_sse2(in0, in1);
- FLIPUD_PTR(dest, stride, 16);
- break;
-#endif // CONFIG_EXT_TX
- default: assert(0); break;
- }
-
- write_buffer_8x16(dest, in0, stride);
- dest += 8;
- write_buffer_8x16(dest, in1, stride);
-}
-
#if CONFIG_EXT_TX
static void iidtx16_8col(__m128i *in) {
const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
@@ -501,7 +438,98 @@
iidtx16_8col(in0);
iidtx16_8col(in1);
}
+#endif
+void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[32];
+ __m128i *in0 = &in[0];
+ __m128i *in1 = &in[16];
+
+ load_buffer_8x16(input, in0);
+ input += 8;
+ load_buffer_8x16(input, in1);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case DCT_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case ADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case V_DCT:
+ iidtx16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case H_DCT:
+ aom_idct16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_ADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case H_ADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_FLIPADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case H_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+
+ write_buffer_8x16(dest, in0, stride);
+ dest += 8;
+ write_buffer_8x16(dest, in1, stride);
+}
+
+#if CONFIG_EXT_TX
static void iidtx8_sse2(__m128i *in) {
in[0] = _mm_slli_epi16(in[0], 1);
in[1] = _mm_slli_epi16(in[1], 1);
@@ -543,6 +571,7 @@
in[6] = mm_reverse_epi16(in[6]);
in[7] = mm_reverse_epi16(in[7]);
}
+#endif // CONFIG_EXT_TX
static INLINE void scale_sqrt2_8x4(__m128i *in) {
// Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
@@ -665,26 +694,31 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case H_DCT:
- idct8_sse2(in);
+#endif
+ aom_idct8_sse2(in);
array_transpose_8x8(in, in);
- idct8_sse2(in + 8);
+ aom_idct8_sse2(in + 8);
array_transpose_8x8(in + 8, in + 8);
break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST:
- iadst8_sse2(in);
+#endif
+ aom_iadst8_sse2(in);
array_transpose_8x8(in, in);
- iadst8_sse2(in + 8);
+ aom_iadst8_sse2(in + 8);
array_transpose_8x8(in + 8, in + 8);
break;
+#if CONFIG_EXT_TX
case V_FLIPADST:
case V_ADST:
case V_DCT:
@@ -692,6 +726,7 @@
iidtx8_sse2(in);
iidtx8_sse2(in + 8);
break;
+#endif
default: assert(0); break;
}
scale_sqrt2_8x8(in);
@@ -701,33 +736,50 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
- case V_DCT: idct16_8col(in); break;
+ case V_DCT:
+#endif
+ idct16_8col(in);
+ break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
- case V_FLIPADST: iadst16_8col(in); break;
+ case V_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
case IDTX: iidtx16_8col(in); break;
+#endif
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case H_DCT:
+#endif
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case H_ADST:
case V_ADST:
case V_DCT:
- case IDTX: write_buffer_8x16(dest, in, stride); break;
+ case IDTX:
+#endif
+ write_buffer_8x16(dest, in, stride);
+ break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
@@ -743,6 +795,7 @@
flip_buffer_lr_8x8(in + 8);
write_buffer_8x16(dest + stride * 15, in, -stride);
break;
+#endif
default: assert(0); break;
}
}
@@ -809,20 +862,30 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
- case H_DCT: idct16_8col(in); break;
+ case H_DCT:
+#endif
+ idct16_8col(in);
+ break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
- case H_FLIPADST: iadst16_8col(in); break;
+ case H_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
case V_FLIPADST:
case V_ADST:
case V_DCT:
case IDTX: iidtx16_8col(in); break;
+#endif
default: assert(0); break;
}
@@ -834,22 +897,27 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case V_DCT:
- idct8_sse2(in);
- idct8_sse2(in + 8);
+#endif
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in + 8);
break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST:
- iadst8_sse2(in);
- iadst8_sse2(in + 8);
+#endif
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in + 8);
break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
@@ -859,22 +927,26 @@
iidtx8_sse2(in);
iidtx8_sse2(in + 8);
break;
+#endif
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
- case H_DCT:
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
case H_ADST:
case V_ADST:
case V_DCT:
case IDTX:
+#endif
write_buffer_8x8_round6(dest, in, stride);
write_buffer_8x8_round6(dest + 8, in + 8, stride);
break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST:
@@ -895,6 +967,7 @@
write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
break;
+#endif
default: assert(0); break;
}
}
@@ -933,22 +1006,26 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
- case H_DCT: idct8_sse2(in); break;
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
- case H_FLIPADST: iadst8_sse2(in); break;
+ case H_FLIPADST: aom_iadst8_sse2(in); break;
case V_FLIPADST:
case V_ADST:
case V_DCT:
- case IDTX:
- iidtx8_sse2(in);
- array_transpose_8x8(in, in);
+ case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
+#endif
break;
default: assert(0); break;
}
@@ -967,22 +1044,27 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case V_DCT:
- idct4_sse2(in + 4);
- idct4_sse2(in + 6);
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST:
- iadst4_sse2(in + 4);
- iadst4_sse2(in + 6);
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
@@ -992,6 +1074,7 @@
iidtx4_sse2(in + 6);
array_transpose_4x4(in + 6);
break;
+#endif
default: assert(0); break;
}
@@ -1004,9 +1087,10 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
- case H_DCT:
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
case H_ADST:
case V_ADST:
case V_DCT:
@@ -1028,6 +1112,7 @@
in[2] = mm_reverse_epi16(in[2]);
in[3] = mm_reverse_epi16(in[3]);
FLIPUD_PTR(dest, stride, 4);
+#endif
break;
default: assert(0); break;
}
@@ -1111,22 +1196,27 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case H_DCT:
- idct4_sse2(in + 4);
- idct4_sse2(in + 6);
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST:
- iadst4_sse2(in + 4);
- iadst4_sse2(in + 6);
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
break;
+#if CONFIG_EXT_TX
case V_FLIPADST:
case V_ADST:
case V_DCT:
@@ -1136,6 +1226,7 @@
iidtx4_sse2(in + 6);
array_transpose_4x4(in + 6);
break;
+#endif
default: assert(0); break;
}
@@ -1149,16 +1240,25 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
- case V_DCT: idct8_sse2(in); break;
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
- case V_FLIPADST: iadst8_sse2(in); break;
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
@@ -1166,19 +1266,24 @@
iidtx8_sse2(in);
array_transpose_8x8(in, in);
break;
+#endif
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
- case H_DCT:
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
case H_ADST:
case V_ADST:
case V_DCT:
- case IDTX: break;
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
@@ -1205,6 +1310,7 @@
in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
FLIPUD_PTR(dest, stride, 8);
break;
+#endif
default: assert(0); break;
}
in[0] = _mm_unpacklo_epi64(in[0], in[1]);
@@ -1252,9 +1358,10 @@
// Generate the bottom half of the output
scale_sqrt2_8x16(bl);
scale_sqrt2_8x16(br);
- idct16_sse2(bl, br); // Includes a transposition
+ aom_idct16_sse2(bl, br); // Includes a transposition
}
+#if CONFIG_EXT_TX
static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
__m128i *br) {
int i;
@@ -1267,6 +1374,7 @@
br[i] = _mm_slli_epi16(br[i], 2);
}
}
+#endif // CONFIG_EXT_TX
static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
__m128i *intr, __m128i *inbl,
@@ -1307,22 +1415,27 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case H_DCT:
- idct16_sse2(intl, intr);
- idct16_sse2(inbl, inbr);
+#endif
+ aom_idct16_sse2(intl, intr);
+ aom_idct16_sse2(inbl, inbr);
break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
case H_FLIPADST:
- iadst16_sse2(intl, intr);
- iadst16_sse2(inbl, inbr);
+#endif
+ aom_iadst16_sse2(intl, intr);
+ aom_iadst16_sse2(inbl, inbr);
break;
+#if CONFIG_EXT_TX
case V_FLIPADST:
case V_ADST:
case V_DCT:
@@ -1330,6 +1443,7 @@
iidtx16_sse2(intl, intr);
iidtx16_sse2(inbl, inbr);
break;
+#endif
default: assert(0); break;
}
@@ -1342,33 +1456,47 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
- case V_DCT: idct32_16col(intl, intr, inbl, inbr); break;
+ case V_DCT:
+#endif
+ idct32_16col(intl, intr, inbl, inbr);
+ break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
- case V_FLIPADST: ihalfright32_16col(intl, intr, inbl, inbr); break;
+ case V_FLIPADST:
+#endif
+ ihalfright32_16col(intl, intr, inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
- case H_DCT:
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
case H_ADST:
case V_ADST:
case V_DCT:
- case IDTX: break;
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
@@ -1395,6 +1523,7 @@
}
FLIPUD_PTR(dest, stride, 32);
break;
+#endif
default: assert(0); break;
}
write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
@@ -1439,20 +1568,30 @@
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
- case H_DCT: idct32_16col(in0, in1, in2, in3); break;
+ case H_DCT:
+#endif
+ idct32_16col(in0, in1, in2, in3);
+ break;
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
case H_ADST:
- case H_FLIPADST: ihalfright32_16col(in0, in1, in2, in3); break;
+ case H_FLIPADST:
+#endif
+ ihalfright32_16col(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
case V_FLIPADST:
case V_ADST:
case V_DCT:
case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
default: assert(0); break;
}
@@ -1465,22 +1604,27 @@
switch (tx_type) {
case DCT_DCT:
case DCT_ADST:
+#if CONFIG_EXT_TX
case DCT_FLIPADST:
case V_DCT:
- idct16_sse2(in0, in1);
- idct16_sse2(in2, in3);
+#endif
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in2, in3);
break;
case ADST_DCT:
case ADST_ADST:
+#if CONFIG_EXT_TX
case FLIPADST_ADST:
case ADST_FLIPADST:
case FLIPADST_FLIPADST:
case FLIPADST_DCT:
case V_ADST:
case V_FLIPADST:
- iadst16_sse2(in0, in1);
- iadst16_sse2(in2, in3);
+#endif
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in2, in3);
break;
+#if CONFIG_EXT_TX
case H_DCT:
case H_ADST:
case H_FLIPADST:
@@ -1488,19 +1632,24 @@
iidtx16_sse2(in0, in1);
iidtx16_sse2(in2, in3);
break;
+#endif
default: assert(0); break;
}
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
- case H_DCT:
case DCT_ADST:
case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
case H_ADST:
case V_ADST:
case V_DCT:
- case IDTX: break;
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
@@ -1527,8 +1676,8 @@
}
FLIPUD_PTR(dest, stride, 16);
break;
+#endif
default: assert(0); break;
}
write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
}
-#endif // CONFIG_EXT_TX
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0f958e4..b1de763 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -38,6 +38,7 @@
#endif // CONFIG_DERING
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
#include "av1/common/idct.h"
#include "av1/common/pred_common.h"
#include "av1/common/quant_common.h"
@@ -56,6 +57,14 @@
#define MAX_AV1_HEADER_SIZE 80
#define ACCT_STR __func__
+static struct aom_read_bit_buffer *init_read_bit_buffer(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
+static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
+ size_t partition_size);
+static size_t read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb);
+
static int is_compound_reference_allowed(const AV1_COMMON *cm) {
int i;
if (frame_is_intra_only(cm)) return 0;
@@ -106,21 +115,27 @@
return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
}
+static void read_tx_size_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ for (k = 0; k < i + 1; ++k)
+ av1_diff_update_prob(r, &fc->tx_size_probs[i][j][k], ACCT_STR);
+}
+
+#if !CONFIG_EC_ADAPT
static void read_switchable_interp_probs(FRAME_CONTEXT *fc, aom_reader *r) {
int i, j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
av1_diff_update_prob(r, &fc->switchable_interp_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
- fc->switchable_interp_cdf[j]);
-#endif
}
}
+#endif
static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
- int i;
#if CONFIG_REF_MV
+ int i;
for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
@@ -133,18 +148,20 @@
av1_diff_update_prob(r, &fc->new2mv_prob, ACCT_STR);
#endif // CONFIG_EXT_INTER
#else
- int j;
+#if !CONFIG_EC_ADAPT
+ int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
for (j = 0; j < INTER_MODES - 1; ++j)
av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
- fc->inter_mode_cdf[i]);
-#endif
}
+#else
+ (void)fc;
+ (void)r;
+#endif
#endif
}
+#if !CONFIG_EC_ADAPT
#if CONFIG_EXT_INTER
static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
int i, j;
@@ -157,6 +174,26 @@
}
}
#endif // CONFIG_EXT_INTER
+#if !CONFIG_EXT_TX
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (j = 0; j < TX_TYPES; ++j) {
+ for (k = 0; k < TX_TYPES - 1; ++k)
+ av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
+ }
+ }
+ }
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ for (k = 0; k < TX_TYPES - 1; ++k)
+ av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
+ }
+ }
+}
+#endif
+#endif
static REFERENCE_MODE read_frame_reference_mode(
const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
@@ -206,12 +243,11 @@
}
static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) {
- int i, j;
+ int i;
+#if !CONFIG_EC_ADAPT
+ int j;
update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
-#endif
for (i = 0; i < 2; ++i) {
nmv_component *const comp_ctx = &ctx->comps[i];
@@ -219,25 +255,15 @@
update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
-#endif
}
-
for (i = 0; i < 2; ++i) {
nmv_component *const comp_ctx = &ctx->comps[i];
for (j = 0; j < CLASS0_SIZE; ++j) {
update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
- comp_ctx->class0_fp_cdf[j]);
-#endif
}
update_mv_probs(comp_ctx->fp, MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
-#endif
}
+#endif // !CONFIG_EC_ADAPT
if (allow_hp) {
for (i = 0; i < 2; ++i) {
@@ -325,13 +351,17 @@
const TX_SIZE plane_tx_size =
plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
: mbmi->inter_tx_size[tx_row][tx_col];
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+ // Scale to match transform block unit.
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
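
A rough scalar model of the unit change made in this hunk (and in the matching max_blocks/bsl computations later in the patch): the mb_to_*_edge values are kept in 1/8-pel units, so the old '>> 5' converted them straight to 4x4-block units, while the new code shifts by 3 to reach pixels and then by tx_size_wide_log2[0] (2, the log2 width of a 4x4 transform) to reach transform-block units. The helper below is illustrative only; its name is not from the patch.

/* Illustrative only: plane block height in pixels plus the (non-positive)
 * 1/8-pel bottom-edge clamp, expressed as 4x4 transform-block rows. */
static int max_tx_rows_sketch(int plane_height_px, int mb_to_bottom_edge_8pel,
                              int subsampling_y) {
  int rows = plane_height_px; /* start in pixels */
  if (mb_to_bottom_edge_8pel < 0)
    rows += mb_to_bottom_edge_8pel >> (3 + subsampling_y); /* clip at edge */
  return rows >> 2; /* pixels -> 4x4 units; tx_size_wide_log2[0] == 2 */
}
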
@@ -350,15 +380,14 @@
pd->dst.stride, max_scan_line, eob);
*eob_total += eob;
} else {
- int bsl = b_width_log2_lookup[bsize];
+ int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
int i;
assert(bsl > 0);
- --bsl;
for (i = 0; i < 4; ++i) {
- const int offsetr = blk_row + ((i >> 1) << bsl);
- const int offsetc = blk_col + ((i & 0x01) << bsl);
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
@@ -1281,8 +1310,8 @@
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int num_4x4_w = pd->n4_w;
- const int num_4x4_h = pd->n4_h;
+ int block_width = pd->width;
+ int block_height = pd->height;
int row, col;
#if CONFIG_VAR_TX
// TODO(jingning): This can be simplified for decoder performance.
@@ -1297,23 +1326,26 @@
plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const int stepr = tx_size_high_unit[tx_size];
const int stepc = tx_size_wide_unit[tx_size];
- const int max_blocks_wide =
- num_4x4_w +
+ int max_blocks_wide =
+ block_width +
(xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
- (5 + pd->subsampling_x));
- const int max_blocks_high =
- num_4x4_h +
+ (3 + pd->subsampling_x));
+ int max_blocks_high =
+ block_height +
(xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
- (5 + pd->subsampling_y));
-
+ (3 + pd->subsampling_y));
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
for (row = 0; row < max_blocks_high; row += stepr)
for (col = 0; col < max_blocks_wide; col += stepc)
eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
plane, row, col, tx_size);
} else {
#endif
- for (row = 0; row < num_4x4_h; row += bh_var_tx)
- for (col = 0; col < num_4x4_w; col += bw_var_tx)
+ block_width >>= tx_size_wide_log2[0];
+ block_height >>= tx_size_wide_log2[0];
+ for (row = 0; row < block_height; row += bh_var_tx)
+ for (col = 0; col < block_width; col += bw_var_tx)
decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row,
col, max_tx_size, &eobtotal);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -1324,15 +1356,16 @@
plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const int stepr = tx_size_high_unit[tx_size];
const int stepc = tx_size_wide_unit[tx_size];
- const int max_blocks_wide =
- num_4x4_w + (xd->mb_to_right_edge >= 0
- ? 0
- : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- const int max_blocks_high =
- num_4x4_h +
+ int max_blocks_wide =
+ block_width +
+ (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
+ (3 + pd->subsampling_x));
+ int max_blocks_high =
+ block_height +
(xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
- (5 + pd->subsampling_y));
-
+ (3 + pd->subsampling_y));
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
for (row = 0; row < max_blocks_high; row += stepr)
for (col = 0; col < max_blocks_wide; col += stepc)
eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
@@ -1715,20 +1748,21 @@
assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
for (i = 0; i < MAX_MB_PLANE; ++i) {
const struct macroblockd_plane *const pd = &xd->plane[i];
- const int num_4x4_w = pd->n4_w;
- const int num_4x4_h = pd->n4_h;
int row, col;
const TX_SIZE tx_size = i ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
- const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size];
- const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size];
- const int max_blocks_wide =
- num_4x4_w + (xd->mb_to_right_edge >= 0
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ int max_blocks_wide =
+ pd->width + (xd->mb_to_right_edge >= 0
? 0
- : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- const int max_blocks_high =
- num_4x4_h +
+ : xd->mb_to_right_edge >> (3 + pd->subsampling_x));
+ int max_blocks_high =
+ pd->height +
(xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
- (5 + pd->subsampling_y));
+ (3 + pd->subsampling_y));
+
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
for (row = 0; row < max_blocks_high; row += stepr)
for (col = 0; col < max_blocks_wide; col += stepc)
@@ -1775,6 +1809,19 @@
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_DERING
+ if (bsize == BLOCK_64X64) {
+ if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
+ aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR);
+ } else {
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
+ 0;
+ }
+ }
+#endif
#if CONFIG_CLPF
if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
@@ -1812,18 +1859,6 @@
}
}
#endif
-#if CONFIG_DERING
- if (bsize == BLOCK_64X64) {
- if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
- cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
- aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR);
- } else {
- cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
- 0;
- }
- }
-#endif
-#endif // CONFIG_EXT_PARTITION_TYPES
}
#if !CONFIG_ANS
@@ -1868,13 +1903,18 @@
static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
aom_reader *r) {
int i, j, k, l, m;
+#if CONFIG_EC_ADAPT
+ const int node_limit = UNCONSTRAINED_NODES - 1;
+#else
+ const int node_limit = UNCONSTRAINED_NODES;
+#endif
if (aom_read_bit(r, ACCT_STR))
for (i = 0; i < PLANE_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
- for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ for (m = 0; m < node_limit; ++m)
av1_diff_update_prob(r, &coef_probs[i][j][k][l][m], ACCT_STR);
}
@@ -1883,9 +1923,6 @@
TX_SIZE tx_size;
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
read_coef_probs_common(fc->coef_probs[tx_size], r);
-#if CONFIG_RANS || CONFIG_DAALA_EC
- av1_coef_pareto_cdfs(fc);
-#endif // CONFIG_RANS
}
static void setup_segmentation(AV1_COMMON *const cm,
@@ -2473,6 +2510,17 @@
pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
}
#endif // CONFIG_EXT_TILE
+#if CONFIG_TILE_GROUPS
+ // Store an index to the location of the tile group information
+ pbi->tg_size_bit_offset = rb->bit_offset;
+ pbi->tg_size = 1 << (cm->log2_tile_rows + cm->log2_tile_cols);
+ if (cm->log2_tile_rows + cm->log2_tile_cols > 0) {
+ pbi->tg_start =
+ aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ pbi->tg_size =
+ 1 + aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ }
+#endif
}
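
As a sketch of the tile-group fields read above: tg_start and tg_size are each coded in log2_tile_rows + log2_tile_cols bits, tg_size is sent minus one, and when no bits are present the default is a single group holding every tile. The read_bits callback below is a placeholder so the sketch stands alone; it is not an aom API.

/* Illustrative reader for the tile-group header fields. */
static void read_tg_header_sketch(int (*read_bits)(void *ctx, int nbits),
                                  void *ctx, int log2_tile_rows,
                                  int log2_tile_cols, int *tg_start,
                                  int *tg_size) {
  const int bits = log2_tile_rows + log2_tile_cols;
  *tg_start = 0;
  *tg_size = 1 << bits; /* default: one tile group covering all tiles */
  if (bits > 0) {
    *tg_start = read_bits(ctx, bits);    /* index of the group's first tile */
    *tg_size = 1 + read_bits(ctx, bits); /* tile count, coded minus one */
  }
}

For example, with a 4x2 tile layout (3 bits per field), tg_start = 4 and a coded size of 3 describe a tile group covering tiles 4..7.
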
static int mem_get_varsize(const uint8_t *src, const int sz) {
@@ -2670,6 +2718,41 @@
AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_TILE_GROUPS
+ int r, c;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tc = 0;
+ int first_tile_in_tg = 0;
+ int hdr_offset;
+ struct aom_read_bit_buffer rb_tg_hdr;
+ uint8_t clear_data[MAX_AV1_HEADER_SIZE];
+ const int num_tiles = tile_rows * tile_cols;
+ const int num_bits = OD_ILOG(num_tiles) - 1;
+ const int hdr_size = pbi->uncomp_hdr_size + pbi->first_partition_size;
+ const int tg_size_bit_offset = pbi->tg_size_bit_offset;
+
+ for (r = 0; r < tile_rows; ++r) {
+ for (c = 0; c < tile_cols; ++c, ++tc) {
+ TileBufferDec *const buf = &tile_buffers[r][c];
+ hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0;
+
+ buf->col = c;
+ if (hdr_offset) {
+ init_read_bit_buffer(pbi, &rb_tg_hdr, data, data_end, clear_data);
+ rb_tg_hdr.bit_offset = tg_size_bit_offset;
+ if (num_tiles) {
+ pbi->tg_start = aom_rb_read_literal(&rb_tg_hdr, num_bits);
+ pbi->tg_size = 1 + aom_rb_read_literal(&rb_tg_hdr, num_bits);
+ }
+ }
+ first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
+ data += hdr_offset;
+ get_tile_buffer(data_end, pbi->tile_size_bytes, 0, &pbi->common.error,
+ &data, pbi->decrypt_cb, pbi->decrypt_state, buf);
+ }
+ }
+#else
int r, c;
const int tile_cols = cm->tile_cols;
const int tile_rows = cm->tile_rows;
@@ -2683,6 +2766,7 @@
&data, pbi->decrypt_cb, pbi->decrypt_state, buf);
}
}
+#endif
}
#endif // CONFIG_EXT_TILE
@@ -2850,7 +2934,10 @@
assert(mi_row > 0);
-#if !CONFIG_VAR_TX
+// When parallel deblocking is enabled, deblocking should not
+// be interleaved with decoding. Instead, deblocking should be done
+// after the entire frame is decoded.
+#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
// Loopfilter one tile row.
if (cm->lf.filter_level && !cm->skip_loop_filter) {
LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
@@ -2873,12 +2960,12 @@
winterface->execute(&pbi->lf_worker);
}
}
+#endif // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
// After loopfiltering, the last 7 row pixels in each superblock row may
// still be changed by the longest loopfilter of the next superblock row.
if (cm->frame_parallel_decode)
av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
-#endif // !CONFIG_VAR_TX
}
#if CONFIG_VAR_TX
@@ -2886,6 +2973,16 @@
av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
cm->lf.filter_level, 0, 0);
#else
+#if CONFIG_PARALLEL_DEBLOCKING
+  // Loopfilter all rows in the frame.
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ winterface->sync(&pbi->lf_worker);
+ lf_data->start = 0;
+ lf_data->stop = cm->mi_rows;
+ winterface->execute(&pbi->lf_worker);
+ }
+#else
// Loopfilter remaining rows in the frame.
if (cm->lf.filter_level && !cm->skip_loop_filter) {
LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
@@ -2894,6 +2991,7 @@
lf_data->stop = cm->mi_rows;
winterface->execute(&pbi->lf_worker);
}
+#endif // CONFIG_PARALLEL_DEBLOCKING
#endif // CONFIG_VAR_TX
if (cm->frame_parallel_decode)
av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);
@@ -3436,12 +3534,12 @@
#endif // CONFIG_EXT_PARTITION
setup_loopfilter(cm, rb);
-#if CONFIG_CLPF
- setup_clpf(pbi, rb);
-#endif
#if CONFIG_DERING
setup_dering(cm, rb);
#endif
+#if CONFIG_CLPF
+ setup_clpf(pbi, rb);
+#endif
#if CONFIG_LOOP_RESTORATION
decode_restoration_mode(cm, rb);
#endif // CONFIG_LOOP_RESTORATION
@@ -3505,11 +3603,11 @@
if (sz == 0)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Invalid header size");
-
return sz;
}
#if CONFIG_EXT_TX
+#if !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
int i, j, k;
int s;
@@ -3535,36 +3633,10 @@
}
}
}
-
+#endif // !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
#else
-static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
- int i, j, k;
- if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
- for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
- for (j = 0; j < TX_TYPES; ++j) {
- for (k = 0; k < TX_TYPES - 1; ++k)
- av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
- fc->intra_ext_tx_cdf[i][j]);
-#endif
- }
- }
- }
- if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
- for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
- for (k = 0; k < TX_TYPES - 1; ++k)
- av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
- fc->inter_ext_tx_cdf[i]);
-#endif
- }
- }
-}
#endif // CONFIG_EXT_TX
-
#if CONFIG_SUPERTX
static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
int i, j;
@@ -3643,7 +3715,10 @@
#endif
FRAME_CONTEXT *const fc = cm->fc;
aom_reader r;
- int k, i, j;
+ int k, i;
+#if !CONFIG_EC_ADAPT
+ int j;
+#endif
#if !CONFIG_ANS
if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
@@ -3660,12 +3735,7 @@
decode_restoration(cm, &r);
#endif
- if (cm->tx_mode == TX_MODE_SELECT) {
- for (i = 0; i < MAX_TX_DEPTH; ++i)
- for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
- for (k = 0; k < i + 1; ++k)
- av1_diff_update_prob(&r, &fc->tx_size_probs[i][j][k], ACCT_STR);
- }
+ if (cm->tx_mode == TX_MODE_SELECT) read_tx_size_probs(fc, &r);
read_coef_probs(fc, cm->tx_mode, &r);
@@ -3674,7 +3744,7 @@
av1_diff_update_prob(&r, &fc->txfm_partition_prob[k], ACCT_STR);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
if (cm->tx_mode == TX_MODE_SELECT) {
- for (i = 1; i < TX_SIZES - 1; ++i)
+ for (i = 1; i < MAX_TX_DEPTH; ++i)
av1_diff_update_prob(&r, &fc->rect_tx_prob[i], ACCT_STR);
}
#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -3688,6 +3758,7 @@
av1_diff_update_prob(&r, &fc->delta_q_prob[k], ACCT_STR);
#endif
+#if !CONFIG_EC_ADAPT
if (cm->seg.enabled && cm->seg.update_map) {
if (cm->seg.temporal_update) {
for (k = 0; k < PREDICTION_PROBS; k++)
@@ -3695,19 +3766,11 @@
}
for (k = 0; k < MAX_SEGMENTS - 1; k++)
av1_diff_update_prob(&r, &cm->fc->seg.tree_probs[k], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
- cm->fc->seg.tree_cdf);
-#endif
}
for (j = 0; j < INTRA_MODES; j++) {
for (i = 0; i < INTRA_MODES - 1; ++i)
av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[j],
- fc->uv_mode_cdf[j]);
-#endif
}
#if CONFIG_EXT_PARTITION_TYPES
@@ -3717,41 +3780,32 @@
for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
#else
- for (j = 0; j < PARTITION_CONTEXTS; ++j) {
+ for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[j],
- fc->partition_cdf[j]);
-#endif
- }
#endif // CONFIG_EXT_PARTITION_TYPES
-
+#endif // EC_ADAPT, DAALA_EC
#if CONFIG_EXT_INTRA
for (i = 0; i < INTRA_FILTERS + 1; ++i)
for (j = 0; j < INTRA_FILTERS - 1; ++j)
av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j], ACCT_STR);
-#endif // CONFIG_EXT_INTRA
+#endif  // CONFIG_EXT_INTRA
if (frame_is_intra_only(cm)) {
av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
#if CONFIG_DAALA_EC
av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
#endif
+#if !CONFIG_EC_ADAPT
for (k = 0; k < INTRA_MODES; k++)
- for (j = 0; j < INTRA_MODES; j++) {
+ for (j = 0; j < INTRA_MODES; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[k][j],
- cm->kf_y_cdf[k][j]);
#endif
- }
} else {
#if !CONFIG_REF_MV
nmv_context *const nmvc = &fc->nmvc;
#endif
-
read_inter_mode_probs(fc, &r);
#if CONFIG_EXT_INTER
@@ -3788,24 +3842,23 @@
}
#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if !CONFIG_EC_ADAPT
if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r);
+#endif
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR);
if (cm->reference_mode != SINGLE_REFERENCE)
setup_compound_reference_mode(cm);
-
read_frame_reference_mode_probs(cm, &r);
+#if !CONFIG_EC_ADAPT
for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
for (i = 0; i < INTRA_MODES - 1; ++i)
av1_diff_update_prob(&r, &fc->y_mode_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[j],
- fc->y_mode_cdf[j]);
-#endif
}
+#endif
#if CONFIG_REF_MV
for (i = 0; i < NMV_CONTEXTS; ++i)
@@ -3813,14 +3866,23 @@
#else
read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
#endif
+#if !CONFIG_EC_ADAPT
read_ext_tx_probs(fc, &r);
+#endif // EC_ADAPT, DAALA_EC
#if CONFIG_SUPERTX
if (!xd->lossless[0]) read_supertx_probs(fc, &r);
#endif
#if CONFIG_GLOBAL_MOTION
read_global_motion(cm, &r);
-#endif // CONFIG_GLOBAL_MOTION
+#endif  // CONFIG_GLOBAL_MOTION
}
+#if CONFIG_EC_MULTISYMBOL
+ av1_coef_pareto_cdfs(fc);
+ av1_set_mv_cdfs(&fc->nmvc);
+#if CONFIG_DAALA_EC
+ av1_set_mode_cdfs(cm);
+#endif
+#endif
return aom_reader_has_error(&r);
}
@@ -3931,7 +3993,56 @@
if (profile > 2) profile += aom_rb_read_bit(rb);
return (BITSTREAM_PROFILE)profile;
}
+#if CONFIG_TILE_GROUPS
+static int read_all_headers(AV1Decoder *pbi, struct aom_read_bit_buffer *rb,
+ const uint8_t **p_data,
+ const uint8_t **p_data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ YV12_BUFFER_CONFIG *fb = (YV12_BUFFER_CONFIG *)xd->cur_buf;
+ pbi->first_partition_size = read_uncompressed_header(pbi, rb);
+ pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+#if CONFIG_GLOBAL_MOTION
+ xd->global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (!pbi->first_partition_size) {
+// showing a frame directly
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame)
+ *p_data_end = *p_data + pbi->uncomp_hdr_size;
+ else
+#endif // CONFIG_EXT_REFS
+ *p_data_end = *p_data + (cm->profile <= PROFILE_2 ? 1 : 2);
+ return 1;
+ }
+
+ *p_data += pbi->uncomp_hdr_size;
+
+ if (!read_is_valid(*p_data, pbi->first_partition_size, *p_data_end))
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt header length");
+
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
+ av1_zero(cm->counts);
+
+ xd->corrupted = 0;
+ fb->corrupted =
+ read_compressed_header(pbi, *p_data, pbi->first_partition_size);
+ if (fb->corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
+
+ *p_data += pbi->first_partition_size;
+
+ return 0;
+}
+#endif
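
A hedged usage sketch of the helper above, not code from this patch: roughly how the frame decoder could pull the per-tile-group headers through read_all_headers() using the forward-declared init_read_bit_buffer(). A nonzero return means the frame is shown directly and carries no tile data.

static int decode_tile_group_headers_sketch(AV1Decoder *pbi,
                                            const uint8_t **data,
                                            const uint8_t **data_end) {
  struct aom_read_bit_buffer rb;
  uint8_t clear_data[MAX_AV1_HEADER_SIZE];
  /* Parse the uncompressed and compressed headers, advancing *data past
   * them; a nonzero return means no tile payload follows. */
  init_read_bit_buffer(pbi, &rb, *data, *data_end, clear_data);
  return read_all_headers(pbi, &rb, data, data_end);
}
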
void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
const uint8_t *data_end, const uint8_t **p_data_end) {
AV1_COMMON *const cm = &pbi->common;
@@ -3971,6 +4082,10 @@
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
+#if CONFIG_SIMP_MV_PRED
+ cm->setup_mi(cm);
+#endif
+
cm->use_prev_frame_mvs =
!cm->error_resilient_mode && cm->width == cm->last_width &&
cm->height == cm->last_height && !cm->last_intra_only &&
@@ -4067,6 +4182,12 @@
}
#endif // CONFIG_LOOP_RESTORATION
+#if CONFIG_DERING
+ if (cm->dering_level && !cm->skip_loop_filter) {
+ av1_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
+ }
+#endif // CONFIG_DERING
+
#if CONFIG_CLPF
if (!cm->skip_loop_filter) {
const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
@@ -4088,11 +4209,6 @@
}
if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
#endif
-#if CONFIG_DERING
- if (cm->dering_level && !cm->skip_loop_filter) {
- av1_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
- }
-#endif // CONFIG_DERING
if (!xd->corrupted) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 3993e72..77cea8a 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -42,7 +42,7 @@
#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
#if CONFIG_DAALA_EC
-static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_cdf_prob *cdf) {
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
return (PREDICTION_MODE)
av1_intra_mode_inv[aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR)];
}
@@ -249,6 +249,24 @@
}
#endif
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, aom_reader *r) {
+ if (is_motion_variation_allowed(mbmi)) {
+ int motion_mode;
+ FRAME_COUNTS *counts = xd->counts;
+
+ motion_mode =
+ aom_read_tree(r, av1_motion_mode_tree,
+ cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
+ if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
+ return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+ } else {
+ return SIMPLE_TRANSLATION;
+ }
+}
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
#if CONFIG_EXT_INTER
static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
aom_reader *r, int16_t ctx) {
@@ -264,8 +282,7 @@
}
#endif // CONFIG_EXT_INTER
-static int read_segment_id(aom_reader *r,
- const struct segmentation_probs *segp) {
+static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) {
#if CONFIG_DAALA_EC
return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR);
#else
@@ -281,24 +298,29 @@
int is_split = 0;
const int tx_row = blk_row >> 1;
const int tx_col = blk_col >> 1;
- int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ int max_blocks_high = block_size_high[mbmi->sb_type];
+ int max_blocks_wide = block_size_wide[mbmi->sb_type];
int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
- xd->left_txfm_context + tx_row, tx_size);
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
TX_SIZE(*const inter_tx_size)
[MAX_MIB_SIZE] =
(TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
- if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
- if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
+ if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 3;
+ if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 3;
+
+ // Scale to transform block unit.
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
if (depth == MAX_VARTX_DEPTH) {
int idx, idy;
inter_tx_size[0][0] = tx_size;
- for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
- for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
inter_tx_size[idy][idx] = tx_size;
mbmi->tx_size = tx_size;
if (counts) ++counts->txfm_partition[ctx][0];
@@ -311,7 +333,8 @@
if (is_split) {
BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
- int bsl = b_width_log2_lookup[bsize];
+ // Half the block size in transform block unit.
+ int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
int i;
if (counts) ++counts->txfm_partition[ctx][1];
@@ -325,18 +348,17 @@
}
assert(bsl > 0);
- --bsl;
for (i = 0; i < 4; ++i) {
- int offsetr = blk_row + ((i >> 1) << bsl);
- int offsetc = blk_col + ((i & 0x01) << bsl);
+ int offsetr = blk_row + ((i >> 1) * bsl);
+ int offsetc = blk_col + ((i & 0x01) * bsl);
read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, depth + 1, offsetr,
offsetc, r);
}
} else {
int idx, idy;
inter_tx_size[0][0] = tx_size;
- for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
- for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
inter_tx_size[idy][idx] = tx_size;
mbmi->tx_size = tx_size;
if (counts) ++counts->txfm_partition[ctx][0];
@@ -350,11 +372,11 @@
int tx_size_cat, aom_reader *r) {
FRAME_COUNTS *counts = xd->counts;
const int ctx = get_tx_size_context(xd);
- int tx_size =
- aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
- cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
- if (counts) ++counts->tx_size[tx_size_cat][ctx][tx_size];
- return (TX_SIZE)tx_size;
+ int depth = aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
+ cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
+ TX_SIZE tx_size = depth_to_tx_size(depth);
+ if (counts) ++counts->tx_size[tx_size_cat][ctx][depth];
+ return tx_size;
}
static TX_SIZE read_tx_size_intra(AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -792,12 +814,11 @@
}
}
-static int read_mv_component(aom_reader *r, const nmv_component *mvcomp,
- int usehp) {
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp, int usehp) {
int mag, d, fr, hp;
const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
const int mv_class =
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
#else
aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
@@ -818,7 +839,7 @@
}
// Fractional part
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
MV_FP_SIZE, ACCT_STR);
#else
@@ -836,25 +857,24 @@
}
static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
- const nmv_context *ctx, nmv_context_counts *counts,
+ nmv_context *ctx, nmv_context_counts *counts,
int allow_hp) {
MV_JOINT_TYPE joint_type;
- const int use_hp = allow_hp && av1_use_mv_hp(ref);
MV diff = { 0, 0 };
joint_type =
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
(MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
#else
(MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
#endif
if (mv_joint_vertical(joint_type))
- diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
+ diff.row = read_mv_component(r, &ctx->comps[0], allow_hp);
if (mv_joint_horizontal(joint_type))
- diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
+ diff.col = read_mv_component(r, &ctx->comps[1], allow_hp);
- av1_inc_mv(&diff, counts, use_hp);
+ av1_inc_mv(&diff, counts, allow_hp);
mv->row = ref->row + diff.row;
mv->col = ref->col + diff.col;
@@ -975,24 +995,6 @@
}
}
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
- MB_MODE_INFO *mbmi, aom_reader *r) {
- if (is_motion_variation_allowed(mbmi)) {
- int motion_mode;
- FRAME_COUNTS *counts = xd->counts;
-
- motion_mode =
- aom_read_tree(r, av1_motion_mode_tree,
- cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
- if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
- return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
- } else {
- return SIMPLE_TRANSLATION;
- }
-}
-#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
static INLINE InterpFilter read_interp_filter(AV1_COMMON *const cm,
MACROBLOCKD *const xd,
#if CONFIG_DUAL_FILTER
@@ -1801,9 +1803,9 @@
inter_block) {
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
- const int bs = num_4x4_blocks_wide_lookup[txb_size];
- const int width = num_4x4_blocks_wide_lookup[bsize];
- const int height = num_4x4_blocks_high_lookup[bsize];
+ const int bs = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
int idx, idy;
#if CONFIG_EXT_TX && CONFIG_RECT_TX
int is_rect_tx_allowed = inter_block && is_rect_tx_allowed_bsize(bsize) &&
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index fd68d13..262995a 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -107,7 +107,13 @@
int acct_enabled;
Accounting accounting;
#endif
-
+ size_t uncomp_hdr_size; // Size of the uncompressed header
+ size_t first_partition_size; // Size of the compressed header
+#if CONFIG_TILE_GROUPS
+ int tg_size; // Number of tiles in the current tilegroup
+ int tg_start; // First tile in the current tilegroup
+ int tg_size_bit_offset;
+#endif
} AV1Decoder;
int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 67af6f3..795b1b0 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -48,15 +48,14 @@
}
#if CONFIG_AOM_QM
-static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
- tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
- const int16_t *dq, int ctx, const int16_t *scan,
- const int16_t *nb, aom_reader *r,
+static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
+ int ctx, const int16_t *scan, const int16_t *nb,
+ int16_t *max_scan_line, aom_reader *r,
const qm_val_t *iqm[2][TX_SIZES])
#else
-static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
- tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
- const int16_t *dq,
+static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
#if CONFIG_NEW_QUANT
dequant_val_type_nuq *dq_val,
#endif // CONFIG_NEW_QUANT
@@ -65,22 +64,22 @@
#endif
{
FRAME_COUNTS *counts = xd->counts;
- const int max_eob = get_tx2d_size(tx_size);
- const FRAME_CONTEXT *const fc = xd->fc;
+ FRAME_CONTEXT *const fc = xd->fc;
+ const int max_eob = tx_size_2d[tx_size];
const int ref = is_inter_block(&xd->mi[0]->mbmi);
#if CONFIG_AOM_QM
const qm_val_t *iqmatrix = iqm[!ref][tx_size];
#endif
int band, c = 0;
const int tx_size_ctx = txsize_sqr_map[tx_size];
- const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size_ctx][type][ref];
const aom_prob *prob;
-#if CONFIG_RANS || CONFIG_DAALA_EC
- const aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob(*coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
fc->coef_cdfs[tx_size_ctx][type][ref];
- const aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
-#endif // CONFIG_RANS
+ aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
+#endif // CONFIG_EC_MULTISYMBOL
unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
uint8_t token_cache[MAX_TX_SQUARE];
@@ -169,7 +168,7 @@
*max_scan_line = AOMMAX(*max_scan_line, scan[c]);
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
cdf = &coef_cdfs[band][ctx];
token = ONE_TOKEN +
aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1, ACCT_STR);
@@ -215,7 +214,7 @@
#endif
} break;
}
-#else
+#else // CONFIG_EC_MULTISYMBOL
if (!aom_read(r, prob[ONE_CONTEXT_NODE], ACCT_STR)) {
INCREMENT_COUNT(ONE_TOKEN);
token = ONE_TOKEN;
@@ -266,7 +265,7 @@
}
}
}
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
#if CONFIG_NEW_QUANT
v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
@@ -345,7 +344,7 @@
#if CONFIG_AOM_QM
const int eob = decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size,
tx_type, dequant, ctx, sc->scan, sc->neighbors,
- &sc->max_scan_line, r, pd->seg_iqmatrix[seg_id]);
+ max_scan_line, r, pd->seg_iqmatrix[seg_id]);
#else
const int eob =
decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 260319e..fcfae7c 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -115,6 +115,9 @@
#if CONFIG_LOOP_RESTORATION
static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
#endif // CONFIG_LOOP_RESTORATION
+static void write_uncompressed_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb);
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
void av1_encode_token_init(void) {
#if CONFIG_EXT_TX || CONFIG_PALETTE
@@ -314,7 +317,7 @@
static void prob_diff_update(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
- aom_writer *w) {
+ int probwt, aom_writer *w) {
int i;
unsigned int branch_ct[32][2];
@@ -323,13 +326,14 @@
av1_tree_probs_from_distribution(tree, branch_ct, counts);
for (i = 0; i < n - 1; ++i)
- av1_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
+ av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
}
+#if !CONFIG_EC_ADAPT
static int prob_diff_update_savings(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
- const unsigned int counts[/*n - 1*/],
- int n) {
+ const unsigned int counts[/*n - 1*/], int n,
+ int probwt) {
int i;
unsigned int branch_ct[32][2];
int savings = 0;
@@ -338,10 +342,12 @@
assert(n <= 32);
av1_tree_probs_from_distribution(tree, branch_ct, counts);
for (i = 0; i < n - 1; ++i) {
- savings += av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i]);
+ savings +=
+ av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i], probwt);
}
return savings;
}
+#endif
#if CONFIG_VAR_TX
static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
@@ -350,13 +356,12 @@
aom_writer *w) {
const int tx_row = blk_row >> 1;
const int tx_col = blk_col >> 1;
- int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
- int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
- xd->left_txfm_context + tx_row, tx_size);
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
- if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
- if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
@@ -394,11 +399,11 @@
}
static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
- FRAME_COUNTS *counts) {
+ FRAME_COUNTS *counts, int probwt) {
int k;
for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
- counts->txfm_partition[k]);
+ counts->txfm_partition[k], probwt);
}
#endif
@@ -414,6 +419,7 @@
const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
: intra_tx_size_cat_lookup[bsize];
const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
@@ -423,7 +429,7 @@
av1_write_token(w, av1_tx_size_tree[tx_size_cat],
cm->fc->tx_size_probs[tx_size_cat][tx_size_ctx],
- &tx_size_encodings[tx_size_cat][coded_tx_size]);
+ &tx_size_encodings[tx_size_cat][depth]);
}
}
@@ -431,23 +437,33 @@
static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
int i;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
- av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i]);
+ av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i],
+ probwt);
for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i],
- counts->zeromv_mode[i]);
+ counts->zeromv_mode[i], probwt);
for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
- av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i]);
+ av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i],
+ probwt);
for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
- av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i]);
+ av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i],
+ probwt);
#if CONFIG_EXT_INTER
- av1_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode);
+ av1_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode,
+ probwt);
#endif // CONFIG_EXT_INTER
}
#endif
#if CONFIG_EXT_INTER
-static void update_inter_compound_mode_probs(AV1_COMMON *cm, aom_writer *w) {
+static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt,
+ aom_writer *w) {
const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
int i;
@@ -456,7 +472,7 @@
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
savings += prob_diff_update_savings(
av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
- cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES);
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt);
}
do_update = savings > savings_thresh;
aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -464,7 +480,7 @@
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
prob_diff_update(
av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
- cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, w);
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w);
}
}
}
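
The probwt argument threaded through this file follows a single pattern: with CONFIG_TILE_GROUPS the frame headers are repeated once per tile group (see the per-group hdr_size handling in decodeframe.c above), so every signalled probability update costs num_tg copies of its bits; otherwise probwt is 1. A rough cost model of the weighted savings test, for illustration only:

/* Illustrative model: an update is worth signalling only if the rate saved by
 * the better probability exceeds the update bits, which are paid once per
 * tile group. */
static int prob_update_is_worthwhile_sketch(int bits_saved_by_new_prob,
                                            int update_cost_bits, int probwt) {
  return bits_saved_by_new_prob > probwt * update_cost_bits;
}
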
@@ -509,9 +525,14 @@
static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
int k;
-
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
for (k = 0; k < DELTA_Q_CONTEXTS; ++k) {
- av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k]);
+ av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k],
+ probwt);
}
}
#endif
@@ -519,25 +540,33 @@
static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
int k;
-
- for (k = 0; k < SKIP_CONTEXTS; ++k)
- av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (k = 0; k < SKIP_CONTEXTS; ++k) {
+ av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k],
+ probwt);
+ }
}
+#if !CONFIG_EC_ADAPT
static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
int j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
- prob_diff_update(av1_switchable_interp_tree,
- cm->fc->switchable_interp_prob[j],
- counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_switchable_interp_tree,
- cm->fc->switchable_interp_prob[j],
- cm->fc->switchable_interp_cdf[j]);
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
#endif
+ prob_diff_update(
+ av1_switchable_interp_tree, cm->fc->switchable_interp_prob[j],
+ counts->switchable_interp[j], SWITCHABLE_FILTERS, probwt, w);
}
}
+#endif
#if CONFIG_EXT_TX
static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
@@ -545,6 +574,11 @@
av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
int i, j;
int s;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
int savings = 0;
int do_update = 0;
@@ -552,7 +586,7 @@
if (!use_inter_ext_tx_for_txsize[s][i]) continue;
savings += prob_diff_update_savings(
av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
- cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s]);
+ cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt);
}
do_update = savings > savings_thresh;
aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -561,7 +595,7 @@
if (!use_inter_ext_tx_for_txsize[s][i]) continue;
prob_diff_update(
av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
- cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], w);
+ cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt, w);
}
}
}
@@ -574,7 +608,7 @@
for (j = 0; j < INTRA_MODES; ++j)
savings += prob_diff_update_savings(
av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
- cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s]);
+ cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s], probwt);
}
do_update = savings > savings_thresh;
aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -582,16 +616,17 @@
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
if (!use_intra_ext_tx_for_txsize[s][i]) continue;
for (j = 0; j < INTRA_MODES; ++j)
- prob_diff_update(
- av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
- cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s], w);
+ prob_diff_update(av1_ext_tx_intra_tree[s],
+ cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set_intra[s], probwt, w);
}
}
}
}
#else
-
+#if !CONFIG_EC_ADAPT
static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
@@ -599,11 +634,16 @@
int savings = 0;
int do_update = 0;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
for (j = 0; j < TX_TYPES; ++j)
savings += prob_diff_update_savings(
av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
- cm->counts.intra_ext_tx[i][j], TX_TYPES);
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt);
}
do_update = savings > savings_thresh;
aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -611,35 +651,28 @@
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
for (j = 0; j < TX_TYPES; ++j) {
prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
- cm->counts.intra_ext_tx[i][j], TX_TYPES, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
- cm->fc->intra_ext_tx_cdf[i][j]);
-#endif
+ cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w);
}
}
}
+
savings = 0;
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
savings +=
prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
- cm->counts.inter_ext_tx[i], TX_TYPES);
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt);
}
do_update = savings > savings_thresh;
aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
if (do_update) {
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
- cm->counts.inter_ext_tx[i], TX_TYPES, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
- cm->fc->inter_ext_tx_cdf[i]);
-#endif
+ cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w);
}
}
}
#endif // CONFIG_EXT_TX
-
+#endif
#if CONFIG_PALETTE
static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
int num) {
@@ -657,7 +690,7 @@
#endif // CONFIG_PALETTE
#if CONFIG_SUPERTX
-static void update_supertx_probs(AV1_COMMON *cm, aom_writer *w) {
+static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
int i, j;
@@ -665,8 +698,8 @@
int do_update = 0;
for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
for (j = 1; j < TX_SIZES; ++j) {
- savings += av1_cond_prob_diff_update_savings(&cm->fc->supertx_prob[i][j],
- cm->counts.supertx[i][j]);
+ savings += av1_cond_prob_diff_update_savings(
+ &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt);
}
}
do_update = savings > savings_thresh;
@@ -675,7 +708,7 @@
for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
for (j = 1; j < TX_SIZES; ++j) {
av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
- cm->counts.supertx[i][j]);
+ cm->counts.supertx[i][j], probwt);
}
}
}
@@ -684,11 +717,11 @@
static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
const TOKENEXTRA *const stop,
- aom_bit_depth_t bit_depth, const TX_SIZE tx) {
+ aom_bit_depth_t bit_depth, const TX_SIZE tx_size) {
const TOKENEXTRA *p = *tp;
#if CONFIG_VAR_TX
int count = 0;
- const int seg_eob = get_tx2d_size(tx);
+ const int seg_eob = tx_size_2d[tx_size];
#endif
#if CONFIG_AOM_HIGHBITDEPTH
const av1_extra_bit *const extra_bits_table =
@@ -703,14 +736,14 @@
while (p < stop && p->token != EOSB_TOKEN) {
const int token = p->token;
aom_tree_index index = 0;
-#if !CONFIG_RANS && !CONFIG_DAALA_EC
+#if !CONFIG_EC_MULTISYMBOL
const struct av1_token *const coef_encoding = &av1_coef_encodings[token];
int coef_value = coef_encoding->value;
int coef_length = coef_encoding->len;
-#endif // !CONFIG_RANS
+#endif // !CONFIG_EC_MULTISYMBOL
const av1_extra_bit *const extra_bits = &extra_bits_table[token];
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
/* skip one or two nodes */
if (!p->skip_eob_node) aom_write(w, token != EOB_TOKEN, p->context_tree[0]);
@@ -743,7 +776,7 @@
}
}
}
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
if (extra_bits->base_val) {
const int bit_string = p->extra;
@@ -751,7 +784,7 @@
// be written excluding
// the sign bit.
int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL)
- ? TX_SIZES - 1 - txsize_sqr_up_map[tx]
+ ? TX_SIZES - 1 - txsize_sqr_up_map[tx_size]
: 0;
if (bit_string_length > 0) {
@@ -797,13 +830,8 @@
const int tx_row = blk_row >> (1 - pd->subsampling_y);
const int tx_col = blk_col >> (1 - pd->subsampling_x);
TX_SIZE plane_tx_size;
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
@@ -814,29 +842,28 @@
if (tx_size == plane_tx_size) {
pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size);
} else {
- int bsl = b_width_log2_lookup[bsize];
+ const int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
int i;
assert(bsl > 0);
- --bsl;
for (i = 0; i < 4; ++i) {
- const int offsetr = blk_row + ((i >> 1) << bsl);
- const int offsetc = blk_col + ((i & 0x01) << bsl);
- int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+ const TX_SIZE sub_txs = tx_size - 1;
+ const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth,
- block + i * step, offsetr, offsetc, tx_size - 1);
+ block + i * step, offsetr, offsetc, sub_txs);
}
}
}
#endif
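For reference, pack_txb_tokens now works purely in 4x4 units: bsl is half the parent block width in 4x4 columns, so the four children of a split sit at row/column offsets of 0 or bsl, and each child advances the coefficient block index by tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs]. A small illustrative helper, not part of the patch:

/* Sketch only: the four child origins visited when a transform is split.
 * bsl is half the parent width in 4x4 units, as computed above. */
static void split_child_offsets(int blk_row, int blk_col, int bsl, int rows[4],
                                int cols[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    rows[i] = blk_row + (i >> 1) * bsl;
    cols[i] = blk_col + (i & 0x01) * bsl;
  }
}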
static void write_segment_id(aom_writer *w, const struct segmentation *seg,
- const struct segmentation_probs *segp,
- int segment_id) {
+ struct segmentation_probs *segp, int segment_id) {
if (seg->enabled && seg->update_map) {
#if CONFIG_DAALA_EC
aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
@@ -1097,7 +1124,7 @@
aom_writer *w) {
AV1_COMMON *const cm = &cpi->common;
#if !CONFIG_REF_MV
- const nmv_context *nmvc = &cm->fc->nmvc;
+ nmv_context *nmvc = &cm->fc->nmvc;
#endif
#if CONFIG_DELTA_Q
@@ -1108,7 +1135,7 @@
const MACROBLOCKD *xd = &x->e_mbd;
#endif
const struct segmentation *const seg = &cm->seg;
- const struct segmentation_probs *const segp = &cm->fc->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const PREDICTION_MODE mode = mbmi->mode;
@@ -1321,7 +1348,7 @@
int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], ref,
mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
#if CONFIG_EXT_INTER
@@ -1346,7 +1373,7 @@
int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], 1,
mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
&mi->bmi[j].ref_mv[1].as_mv,
@@ -1360,7 +1387,7 @@
int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], 0,
mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
&mi->bmi[j].ref_mv[0].as_mv,
@@ -1385,7 +1412,7 @@
int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], ref,
mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
#if CONFIG_EXT_INTER
@@ -1411,7 +1438,7 @@
int nmv_ctx =
av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
&mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv,
@@ -1425,7 +1452,7 @@
int nmv_ctx =
av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
- const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
&mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv,
@@ -1571,15 +1598,15 @@
}
#if CONFIG_DELTA_Q
-static void write_mb_modes_kf(const AV1_COMMON *cm, MACROBLOCKD *xd,
+static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd,
MODE_INFO **mi_8x8, aom_writer *w) {
int skip;
#else
-static void write_mb_modes_kf(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO **mi_8x8, aom_writer *w) {
#endif
const struct segmentation *const seg = &cm->seg;
- const struct segmentation_probs *const segp = &cm->fc->seg;
+ struct segmentation_probs *const segp = &cm->fc->seg;
const MODE_INFO *const mi = mi_8x8[0];
const MODE_INFO *const above_mi = xd->above_mi;
const MODE_INFO *const left_mi = xd->left_mi;
@@ -1800,8 +1827,10 @@
const BLOCK_SIZE plane_bsize =
get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int num_4x4_w =
+ block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h =
+ block_size_high[plane_bsize] >> tx_size_wide_log2[0];
int row, col;
#if CONFIG_EXT_TX && CONFIG_RECT_TX
TX_SIZE tx_size =
@@ -1812,12 +1841,13 @@
if (is_inter_block(mbmi)) {
#endif
const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
- const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
int block = 0;
- const int step = num_4x4_blocks_txsize_lookup[max_tx_size];
- bw = num_4x4_blocks_wide_lookup[txb_size];
- for (row = 0; row < num_4x4_h; row += bw) {
- for (col = 0; col < num_4x4_w; col += bw) {
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+ for (row = 0; row < num_4x4_h; row += bkh) {
+ for (col = 0; col < num_4x4_w; col += bkw) {
pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
cm->bit_depth, block, row, col, max_tx_size);
block += step;
@@ -1826,12 +1856,11 @@
} else {
TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
: m->mbmi.tx_size;
- BLOCK_SIZE txb_size = txsize_to_bsize[tx];
- bw = num_4x4_blocks_wide_lookup[txb_size];
- bh = num_4x4_blocks_high_lookup[txb_size];
+ const int bkw = tx_size_wide_unit[tx];
+ const int bkh = tx_size_high_unit[tx];
- for (row = 0; row < num_4x4_h; row += bh)
- for (col = 0; col < num_4x4_w; col += bw)
+ for (row = 0; row < num_4x4_h; row += bkh)
+ for (col = 0; col < num_4x4_w; col += bkw)
pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
}
#else
@@ -2057,6 +2086,17 @@
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_DERING
+ if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
+ !sb_all_skip(cm, mi_row, mi_col)) {
+ aom_write_literal(
+ w,
+ cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
+ DERING_REFINEMENT_BITS);
+ }
+#endif
#if CONFIG_CLPF
if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
@@ -2088,17 +2128,6 @@
aom_write_literal(w, cm->clpf_blocks[br], 1);
}
#endif
-
-#if CONFIG_DERING
- if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
- !sb_all_skip(cm, mi_row, mi_col)) {
- aom_write_literal(
- w,
- cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
- DERING_REFINEMENT_BITS);
- }
-#endif
-#endif // CONFIG_EXT_PARTITION_TYPES
}
static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
@@ -2161,9 +2190,18 @@
av1_coeff_probs_model *new_coef_probs) {
av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
const aom_prob upd = DIFF_UPDATE_PROB;
+#if CONFIG_EC_ADAPT
+ const int entropy_nodes_update = UNCONSTRAINED_NODES - 1;
+#else
const int entropy_nodes_update = UNCONSTRAINED_NODES;
+#endif
int i, j, k, l, t;
int stepsize = cpi->sf.coeff_prob_appx_step;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cpi->common.num_tg;
+#else
+ const int probwt = 1;
+#endif
switch (cpi->sf.use_fast_coef_updates) {
case TWO_LOOP: {
@@ -2182,10 +2220,11 @@
if (t == PIVOT_NODE)
s = av1_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
else
s = av1_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt);
+
if (s > 0 && newp != oldp) u = 1;
if (u)
savings += s - (int)(av1_cost_zero(upd));
@@ -2217,10 +2256,11 @@
if (t == PIVOT_NODE)
s = av1_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
else
s = av1_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
if (s > 0 && newp != *oldp) u = 1;
aom_write(bc, u, upd);
if (u) {
@@ -2249,14 +2289,14 @@
aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
int s;
int u = 0;
-
if (t == PIVOT_NODE) {
s = av1_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
} else {
s = av1_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+ probwt);
}
if (s > 0 && newp != *oldp) u = 1;
@@ -2840,8 +2880,14 @@
}
}
+#if !CONFIG_EC_ADAPT
static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) {
AV1_COMMON *cm = &cpi->common;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
if (!cm->seg.enabled || !cm->seg.update_map) return;
@@ -2850,19 +2896,16 @@
for (i = 0; i < PREDICTION_PROBS; i++)
av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i],
- cm->counts.seg.pred[i]);
+ cm->counts.seg.pred[i], probwt);
prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
- cm->counts.seg.tree_mispred, MAX_SEGMENTS, w);
+ cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w);
} else {
prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
- cm->counts.seg.tree_total, MAX_SEGMENTS, w);
+ cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w);
}
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
- cm->fc->seg.tree_cdf);
-#endif
}
+#endif
static void write_txfm_mode(TX_MODE mode, struct aom_write_bit_buffer *wb) {
aom_wb_write_bit(wb, mode == TX_MODE_SELECT);
@@ -2871,12 +2914,17 @@
static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w,
FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
if (cm->tx_mode == TX_MODE_SELECT) {
int i, j;
for (i = 0; i < MAX_TX_DEPTH; ++i)
for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j],
- counts->tx_size[i][j], i + 2, w);
+ counts->tx_size[i][j], i + 2, probwt, w);
}
}
@@ -3059,9 +3107,16 @@
}
#endif // CONFIG_EXT_TILE
+#if CONFIG_TILE_GROUPS
+static uint32_t write_tiles(AV1_COMP *const cpi,
+ struct aom_write_bit_buffer *wb,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
+#else
static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
unsigned int *max_tile_size,
unsigned int *max_tile_col_size) {
+#endif
const AV1_COMMON *const cm = &cpi->common;
#if CONFIG_ANS
struct AnsCoder token_ans;
@@ -3074,6 +3129,20 @@
size_t total_size = 0;
const int tile_cols = cm->tile_cols;
const int tile_rows = cm->tile_rows;
+#if CONFIG_TILE_GROUPS
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ const int have_tiles = n_log2_tiles > 0;
+ size_t comp_hdr_size;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cm->num_tg;
+ const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0;
+ int uncompressed_hdr_size = 0;
+ uint8_t *dst = NULL;
+ struct aom_write_bit_buffer comp_hdr_len_wb;
+ struct aom_write_bit_buffer tg_params_wb;
+ int saved_offset;
+#endif
#if CONFIG_EXT_TILE
const int have_tiles = tile_cols * tile_rows > 1;
#endif // CONFIG_EXT_TILE
@@ -3166,6 +3235,28 @@
}
}
#else
+#if CONFIG_TILE_GROUPS
+ write_uncompressed_header(cpi, wb);
+
+ // Write the tile length code. Use full 32 bit length fields for the moment
+ if (have_tiles) aom_wb_write_literal(wb, 3, 2);
+
+ /* Write a placeholder for the number of tiles in each tile group */
+ tg_params_wb = *wb;
+ saved_offset = wb->bit_offset;
+ if (n_log2_tiles) aom_wb_write_literal(wb, 0, n_log2_tiles * 2);
+
+ /* Write a placeholder for the compressed header length */
+ comp_hdr_len_wb = *wb;
+ aom_wb_write_literal(wb, 0, 16);
+
+ uncompressed_hdr_size = aom_wb_bytes_written(wb);
+ dst = wb->bit_buffer;
+ comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
+ aom_wb_write_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
+ total_size += uncompressed_hdr_size + comp_hdr_size;
+#endif
+
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
TileInfo tile_info;
const int is_last_row = (tile_row == tile_rows - 1);
@@ -3175,11 +3266,33 @@
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
const int is_last_col = (tile_col == tile_cols - 1);
- const int is_last_tile = is_last_col && is_last_row;
unsigned int tile_size;
const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
-
+#if !CONFIG_TILE_GROUPS
+ const int is_last_tile = is_last_col && is_last_row;
+#else
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ // All tiles in a tile group have a length
+ const int is_last_tile = 0;
+ if (tile_count >= tg_size) {
+ // Copy uncompressed header
+ memcpy(dst + total_size, dst, uncompressed_hdr_size * sizeof(uint8_t));
+ // Write the number of tiles in the group into the last uncompressed
+ // header
+ aom_wb_write_literal(&tg_params_wb, tile_idx - tile_count,
+ n_log2_tiles);
+ aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ tg_params_wb.bit_offset = saved_offset + 8 * total_size;
+ // Copy compressed header
+ memcpy(dst + total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+ total_size += uncompressed_hdr_size;
+ total_size += comp_hdr_size;
+ tile_count = 0;
+ }
+ tile_count++;
+#endif
av1_tile_set_col(&tile_info, cm, tile_col);
buf->data = dst + total_size;
@@ -3187,19 +3300,19 @@
// The last tile does not have a header.
if (!is_last_tile) total_size += 4;
-#if !CONFIG_ANS
- aom_start_encode(&mode_bc, dst + total_size);
- write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
- assert(tok == tok_end);
- aom_stop_encode(&mode_bc);
- tile_size = mode_bc.pos;
-#else
+#if CONFIG_ANS
buf_ans_write_reset(buf_ans);
write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
assert(tok == tok_end);
ans_write_init(&token_ans, dst + total_size);
buf_ans_flush(buf_ans, &token_ans);
tile_size = ans_write_end(&token_ans);
+#else
+ aom_start_encode(&mode_bc, dst + total_size);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
#endif // !CONFIG_ANS
assert(tile_size > 0);
@@ -3215,6 +3328,14 @@
total_size += tile_size;
}
}
+#if CONFIG_TILE_GROUPS
+ // Write the final tile group size
+ if (n_log2_tiles) {
+ aom_wb_write_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count,
+ n_log2_tiles);
+ aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+ }
+#endif
#endif // CONFIG_EXT_TILE
return (uint32_t)total_size;
}
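The tile-group path in write_tiles splits the tile_rows * tile_cols tiles into cm->num_tg fixed-size groups, writes placeholder fields for the per-group tile range and the compressed header length, and then copies the uncompressed and compressed headers ahead of each subsequent group. A rough sketch of the grouping arithmetic only (not part of the patch; the placeholder rewriting and header copies are as in the code above):

/* Sketch only: number of tile groups (and so of header copies) produced by
 * the fixed split tg_size = ceil(total_tiles / num_tg) used in write_tiles. */
static int count_tile_groups(int tile_rows, int tile_cols, int num_tg) {
  const int total_tiles = tile_rows * tile_cols;
  const int tg_size = (total_tiles + num_tg - 1) / num_tg; /* ceil division */
  int first = 0, groups = 0;
  while (first < total_tiles) {
    first += tg_size < total_tiles - first ? tg_size : total_tiles - first;
    ++groups;
  }
  return groups;
}

For example, a 2x3 tile layout with num_tg = 4 gives tg_size = 2 and three groups, so fewer headers than requested are actually emitted.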
@@ -3443,12 +3564,12 @@
#endif // CONFIG_EXT_PARTITION
encode_loopfilter(cm, wb);
-#if CONFIG_CLPF
- encode_clpf(cm, wb);
-#endif
#if CONFIG_DERING
encode_dering(cm->dering_level, wb);
#endif // CONFIG_DERING
+#if CONFIG_CLPF
+ encode_clpf(cm, wb);
+#endif
#if CONFIG_LOOP_RESTORATION
encode_restoration_mode(cm, wb);
#endif // CONFIG_LOOP_RESTORATION
@@ -3560,6 +3681,12 @@
aom_writer *header_bc;
int i, j;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+
#if CONFIG_ANS
struct AnsCoder header_ans;
int header_size;
@@ -3574,17 +3701,16 @@
#if CONFIG_LOOP_RESTORATION
encode_restoration(cm, header_bc);
#endif // CONFIG_LOOP_RESTORATION
-
update_txfm_probs(cm, header_bc, counts);
update_coef_probs(cpi, header_bc);
#if CONFIG_VAR_TX
- update_txfm_partition_probs(cm, header_bc, counts);
+ update_txfm_partition_probs(cm, header_bc, counts, probwt);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
if (cm->tx_mode == TX_MODE_SELECT) {
for (i = 1; i < TX_SIZES - 1; ++i)
av1_cond_prob_diff_update(header_bc, &fc->rect_tx_prob[i],
- counts->rect_tx[i]);
+ counts->rect_tx[i], probwt);
}
#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
#endif
@@ -3593,155 +3719,143 @@
#if CONFIG_DELTA_Q
update_delta_q_probs(cm, header_bc, counts);
#endif
+#if !CONFIG_EC_ADAPT
update_seg_probs(cpi, header_bc);
for (i = 0; i < INTRA_MODES; ++i) {
prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i],
- counts->uv_mode[i], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
- fc->uv_mode_cdf[i]);
-#endif
+ counts->uv_mode[i], INTRA_MODES, probwt, header_bc);
}
#if CONFIG_EXT_PARTITION_TYPES
prob_diff_update(av1_partition_tree, fc->partition_prob[0],
- counts->partition[0], PARTITION_TYPES, header_bc);
+ counts->partition[0], PARTITION_TYPES, probwt, header_bc);
for (i = 1; i < PARTITION_CONTEXTS; ++i)
prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i],
- counts->partition[i], EXT_PARTITION_TYPES, header_bc);
+ counts->partition[i], EXT_PARTITION_TYPES, probwt,
+ header_bc);
#else
for (i = 0; i < PARTITION_CONTEXTS; ++i) {
prob_diff_update(av1_partition_tree, fc->partition_prob[i],
- counts->partition[i], PARTITION_TYPES, header_bc);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_partition_tree, cm->fc->partition_prob[i],
- cm->fc->partition_cdf[i]);
-#endif
+ counts->partition[i], PARTITION_TYPES, probwt, header_bc);
}
-#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_EC_ADAPT, CONFIG_DAALA_EC
#if CONFIG_EXT_INTRA
for (i = 0; i < INTRA_FILTERS + 1; ++i)
prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i],
- counts->intra_filter[i], INTRA_FILTERS, header_bc);
+ counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc);
#endif // CONFIG_EXT_INTRA
-
+#endif // CONFIG_EC_ADAPT, CONFIG_DAALA_EC
if (frame_is_intra_only(cm)) {
av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
#if CONFIG_DAALA_EC
av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
#endif
+
+#if !CONFIG_EC_ADAPT
for (i = 0; i < INTRA_MODES; ++i)
- for (j = 0; j < INTRA_MODES; ++j) {
+ for (j = 0; j < INTRA_MODES; ++j)
prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j],
- counts->kf_y_mode[i][j], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
- cm->kf_y_cdf[i][j]);
-#endif
- }
+ counts->kf_y_mode[i][j], INTRA_MODES, probwt,
+ header_bc);
+#endif // CONFIG_EC_ADAPT
} else {
#if CONFIG_REF_MV
update_inter_mode_probs(cm, header_bc, counts);
#else
+#if !CONFIG_EC_ADAPT
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
- counts->inter_mode[i], INTER_MODES, header_bc);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
- cm->fc->inter_mode_cdf[i]);
-#endif
+ counts->inter_mode[i], INTER_MODES, probwt, header_bc);
}
#endif
-
+#endif
#if CONFIG_EXT_INTER
- update_inter_compound_mode_probs(cm, header_bc);
+ update_inter_compound_mode_probs(cm, probwt, header_bc);
if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
if (is_interintra_allowed_bsize_group(i)) {
av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
- cm->counts.interintra[i]);
+ cm->counts.interintra[i], probwt);
}
}
for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
prob_diff_update(
av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i],
- counts->interintra_mode[i], INTERINTRA_MODES, header_bc);
+ counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc);
}
for (i = 0; i < BLOCK_SIZES; i++) {
if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
- cm->counts.wedge_interintra[i]);
+ cm->counts.wedge_interintra[i], probwt);
}
}
if (cm->reference_mode != SINGLE_REFERENCE) {
for (i = 0; i < BLOCK_SIZES; i++)
if (is_interinter_wedge_used(i))
av1_cond_prob_diff_update(header_bc, &fc->wedge_interinter_prob[i],
- cm->counts.wedge_interinter[i]);
+ cm->counts.wedge_interinter[i], probwt);
}
#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i],
- counts->motion_mode[i], MOTION_MODES, header_bc);
+ counts->motion_mode[i], MOTION_MODES, probwt, header_bc);
#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
+#if !CONFIG_EC_ADAPT
if (cm->interp_filter == SWITCHABLE)
update_switchable_interp_probs(cm, header_bc, counts);
+#endif
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ counts->intra_inter[i], probwt);
if (cpi->allow_comp_inter_inter) {
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
- counts->comp_inter[i]);
+ counts->comp_inter[i], probwt);
}
if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
for (j = 0; j < (SINGLE_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
- counts->single_ref[i][j]);
+ counts->single_ref[i][j], probwt);
}
}
}
-
if (cm->reference_mode != SINGLE_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
#if CONFIG_EXT_REFS
for (j = 0; j < (FWD_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
- counts->comp_ref[i][j]);
+ counts->comp_ref[i][j], probwt);
}
for (j = 0; j < (BWD_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
- counts->comp_bwdref[i][j]);
+ counts->comp_bwdref[i][j], probwt);
}
#else
for (j = 0; j < (COMP_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
- counts->comp_ref[i][j]);
+ counts->comp_ref[i][j], probwt);
}
#endif // CONFIG_EXT_REFS
}
}
+#if !CONFIG_EC_ADAPT
for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
- counts->y_mode[i], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
- cm->fc->y_mode_cdf[i]);
-#endif
+ counts->y_mode[i], INTRA_MODES, probwt, header_bc);
}
+#endif
av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
#if CONFIG_REF_MV
@@ -3753,14 +3867,23 @@
av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
cm->fc->nmvc.joint_cdf);
#endif
+#if !CONFIG_EC_ADAPT
update_ext_tx_probs(cm, header_bc);
+#endif
#if CONFIG_SUPERTX
- if (!xd->lossless[0]) update_supertx_probs(cm, header_bc);
+ if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
#endif // CONFIG_SUPERTX
#if CONFIG_GLOBAL_MOTION
write_global_motion(cpi, header_bc);
#endif // CONFIG_GLOBAL_MOTION
}
+#if CONFIG_EC_MULTISYMBOL
+ av1_coef_pareto_cdfs(fc);
+ av1_set_mv_cdfs(&fc->nmvc);
+#if CONFIG_DAALA_EC
+ av1_set_mode_cdfs(cm);
+#endif
+#endif
#if CONFIG_ANS
ans_write_init(&header_ans, data);
buf_ans_flush(header_bc, &header_ans);
@@ -3774,6 +3897,7 @@
#endif // CONFIG_ANS
}
+#if !CONFIG_TILE_GROUPS
static int choose_size_bytes(uint32_t size, int spare_msbs) {
// Choose the number of bytes required to represent size, without
// using the 'spare_msbs' number of most significant bits.
@@ -3803,7 +3927,6 @@
default: assert("Invalid size" && 0); break;
}
}
-
static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
const uint32_t data_size, const uint32_t max_tile_size,
const uint32_t max_tile_col_size,
@@ -3901,19 +4024,24 @@
return wpos;
}
}
+#endif // CONFIG_TILE_GROUPS
void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
uint8_t *data = dst;
+#if !CONFIG_TILE_GROUPS
uint32_t compressed_header_size;
uint32_t uncompressed_header_size;
+ struct aom_write_bit_buffer saved_wb;
+#endif
uint32_t data_size;
struct aom_write_bit_buffer wb = { data, 0 };
- struct aom_write_bit_buffer saved_wb;
+
unsigned int max_tile_size;
unsigned int max_tile_col_size;
+
+#if !CONFIG_TILE_GROUPS
int tile_size_bytes;
int tile_col_size_bytes;
-
AV1_COMMON *const cm = &cpi->common;
const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
@@ -3958,7 +4086,10 @@
// Write the encoded tile data
data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
-
+#else
+ data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size);
+#endif
+#if !CONFIG_TILE_GROUPS
if (have_tiles) {
data_size =
remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
@@ -3979,6 +4110,8 @@
// TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
assert(compressed_header_size <= 0xffff);
aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
-
+#else
+ data += data_size;
+#endif
*size = data - dst;
}
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 221e3cd..dd4031f 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -18,6 +18,8 @@
#include "aom_dsp/fwd_txfm.h"
#include "aom_ports/mem.h"
#include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
#include "av1/common/idct.h"
static INLINE void range_check(const tran_low_t *input, const int size,
@@ -1874,12 +1876,103 @@
}
}
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ for (i = 0; i < 32; ++i) {
+ output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
+ }
+ fdct32(inputhalf, output);
+ // Note overall scaling factor is 2 times unitary
+}
+#endif // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64,
+ fwd_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64,
+ fwd_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct64_col, fdct64_row }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright64, fdct64_row }, // ADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_ADST
+ { fhalfright64, fhalfright64 }, // ADST_ADST
+ { fhalfright64, fdct64_row }, // FLIPADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // ADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_ADST
+ { fidtx64, fidtx64 }, // IDTX
+ { fdct64_col, fidtx64 }, // V_DCT
+ { fidtx64, fdct64_row }, // H_DCT
+ { fhalfright64, fidtx64 }, // V_ADST
+ { fidtx64, fhalfright64 }, // H_ADST
+ { fhalfright64, fidtx64 }, // V_FLIPADST
+ { fidtx64, fhalfright64 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[4096];
+ int i, j;
+ tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[64 * 64];
+ maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+ // Columns
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ output[j + i * 64] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+#endif // CONFIG_TX64X64
+
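fhalfright64 above follows the same half-right construction as the smaller ADST stand-ins: the top half of the input is passed through with a fixed gain while the bottom half goes through a half-length DCT. A generic sketch of that structure, reusing the file's fdct_round_shift and Sqrt2 helpers (illustrative only, not part of the patch; assumes n <= 64 so the temporary fits):

/* Sketch only: generic half-right transform of length n. dct_half is a
 * placeholder for the matching n/2-point forward DCT (fdct32 when n == 64). */
static void halfright_sketch(const tran_low_t *input, tran_low_t *output, int n,
                             void (*dct_half)(const tran_low_t *,
                                              tran_low_t *)) {
  tran_low_t tmp[32];
  int i;
  for (i = 0; i < n / 2; ++i)
    output[n / 2 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
  for (i = 0; i < n / 2; ++i)
    tmp[i] = (tran_low_t)fdct_round_shift(input[n / 2 + i] * Sqrt2);
  dct_half(tmp, output); /* fills the first n/2 outputs */
}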
#if CONFIG_EXT_TX
// Forward identity transform.
void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
int bs, int tx_type) {
int r, c;
- const int shift = bs < 32 ? 3 : 2;
+ const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
if (tx_type == IDTX) {
for (r = 0; r < bs; ++r) {
for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
@@ -1894,5 +1987,12 @@
int tx_type) {
av1_fht32x32_c(input, output, stride, tx_type);
}
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_TX64X64
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_EXT_TX
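One note on the av1_fwd_idtx_c change: the identity transform's per-sample gain now drops as the block size grows (8 below 32x32, 4 at 32x32, 2 at 64x64), presumably to keep the 64x64 path within the same output range as the other transform sizes. A trivial sketch of the new rule, not part of the patch:

/* Sketch only: gain applied per sample by the forward identity transform. */
static int fwd_idtx_gain(int bs) {
  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
  return 1 << shift; /* 8 for 4/8/16, 4 for 32, 2 for 64 */
}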
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 3733efc..d1d6ecc 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1305,8 +1305,8 @@
#if CONFIG_VAR_TX
{
const TX_SIZE mtx = mbmi->tx_size;
- const int num_4x4_blocks_wide = num_4x4_blocks_wide_txsize_lookup[mtx] >> 1;
- const int num_4x4_blocks_high = num_4x4_blocks_high_txsize_lookup[mtx] >> 1;
+ const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
+ const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
int idy, idx;
mbmi->inter_tx_size[0][0] = mtx;
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
@@ -4442,8 +4442,10 @@
av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
// Set up pointers to per thread motion search counters.
- td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
- td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
mi_row += cm->mib_size) {
@@ -4484,10 +4486,35 @@
#define MIN_TRANS_THRESH 8
#define GLOBAL_MOTION_ADVANTAGE_THRESH 0.60
#define GLOBAL_MOTION_MODEL ROTZOOM
-// TODO(sarahparker) This function needs to be adjusted
-// to accomodate changes in the paraemter integerization.
-// Commenting it out until the fix is made.
-/*
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int16_t add_param_offset(int param_index, int16_t param_value,
+ int16_t offset) {
+ const int scale_vals[2] = { GM_ALPHA_PREC_DIFF, GM_TRANS_PREC_DIFF };
+ const int clamp_vals[2] = { GM_ALPHA_MAX, GM_TRANS_MAX };
+ const int is_trans_param = param_index < 2;
+ const int is_one_centered = (!is_trans_param) && (param_index & 1);
+
+  // Make the parameter zero-centered and remove the precision shift that was
+  // applied to make it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[is_trans_param];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int16_t)clamp(param_value, -clamp_vals[is_trans_param],
+ clamp_vals[is_trans_param]);
+  // Rescale the parameter back to WARPEDMODEL_PREC_BITS so it is compatible
+  // with the warped motion library
+ param_value *= (1 << scale_vals[is_trans_param]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
+
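A usage sketch for add_param_offset, not part of the patch: in this int16_t parameterization the odd-indexed non-translation entries are the one-centered terms, so the helper strips the (1 << WARPEDMODEL_PREC_BITS) offset, shifts down by GM_ALPHA_PREC_DIFF bits, applies the step, clamps to +/- GM_ALPHA_MAX, and converts back. The parameter index 3 below is purely illustrative.

/* Sketch only: nudge a one-centered term of an identity warp by one unit in
 * its coded (reduced-precision) domain. */
static int16_t step_identity_diagonal_example(void) {
  const int16_t identity_term = (int16_t)(1 << WARPEDMODEL_PREC_BITS);
  return add_param_offset(/*param_index=*/3, identity_term, /*offset=*/1);
}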
static void refine_integerized_param(WarpedMotionParams *wm,
#if CONFIG_AOM_HIGHBITDEPTH
int use_hbd, int bd,
@@ -4500,7 +4527,7 @@
int n_params = n_trans_model_params[wm->wmtype];
int16_t *param_mat = (int16_t *)wm->wmmat;
double step_error;
- int step;
+ int16_t step;
int16_t *param;
int16_t curr_param;
int16_t best_param;
@@ -4519,9 +4546,7 @@
best_param = curr_param;
for (i = 0; i < n_refinements; i++) {
// look to the left
- *param =
- (int16_t)clamp(curr_param - step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
- p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
+ *param = add_param_offset(p, curr_param, -step);
step_error =
av1_warp_erroradv(wm,
#if CONFIG_AOM_HIGHBITDEPTH
@@ -4538,9 +4563,7 @@
}
// look to the right
- *param =
- (int16_t)clamp(curr_param + step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
- p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
+ *param = add_param_offset(p, curr_param, step);
step_error =
av1_warp_erroradv(wm,
#if CONFIG_AOM_HIGHBITDEPTH
@@ -4564,7 +4587,6 @@
*param = best_param;
}
}
-*/
static void convert_to_params(const double *params, TransformationType type,
int16_t *model) {
@@ -4617,6 +4639,9 @@
x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_SIMP_MV_PRED
+ cm->setup_mi(cm);
+#endif
xd->mi = cm->mi_grid_visible;
xd->mi[0] = cm->mi;
@@ -4624,8 +4649,6 @@
av1_zero(*td->counts);
av1_zero(rdc->coef_counts);
av1_zero(rdc->comp_pred_diff);
- rdc->m_search_count = 0; // Count of motion search hits.
- rdc->ex_search_count = 0; // Exhaustive mesh search hits.
#if CONFIG_GLOBAL_MOTION
aom_clear_system_state();
@@ -4643,6 +4666,14 @@
convert_model_to_params(params, GLOBAL_MOTION_MODEL,
&cm->global_motion[frame]);
if (get_gmtype(&cm->global_motion[frame]) > GLOBAL_ZERO) {
+ refine_integerized_param(
+ &cm->global_motion[frame].motion_params,
+#if CONFIG_AOM_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+ ref_buf->y_stride, cpi->Source->y_buffer, cpi->Source->y_width,
+ cpi->Source->y_height, cpi->Source->y_stride, 3);
// compute the advantage of using gm parameters over 0 motion
erroradvantage = av1_warp_erroradv(
&cm->global_motion[frame].motion_params,
@@ -4858,17 +4889,18 @@
int count16x16_16x16p = 0, count16x16_lp = 0;
int count32x32 = 0;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- count4x4 += counts->tx_size[0][i][TX_4X4];
- count4x4 += counts->tx_size[1][i][TX_4X4];
- count4x4 += counts->tx_size[2][i][TX_4X4];
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
- count8x8_lp += counts->tx_size[1][i][TX_8X8];
- count8x8_lp += counts->tx_size[2][i][TX_8X8];
- count8x8_8x8p += counts->tx_size[0][i][TX_8X8];
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+ count8x8_8x8p += counts->tx_size[0][i][1];
- count16x16_16x16p += counts->tx_size[1][i][TX_16X16];
- count16x16_lp += counts->tx_size[2][i][TX_16X16];
- count32x32 += counts->tx_size[2][i][TX_32X32];
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count32x32 += counts->tx_size[2][i][3];
}
#if CONFIG_EXT_TX && CONFIG_RECT_TX
count4x4 += counts->tx_size_implied[0][TX_4X4];
@@ -4955,20 +4987,18 @@
#if CONFIG_VAR_TX
static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
- FRAME_COUNTS *counts, TX_SIZE tx_size,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
int blk_row, int blk_col) {
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const int tx_row = blk_row >> 1;
const int tx_col = blk_col >> 1;
- int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
- xd->left_txfm_context + tx_row, tx_size);
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
- if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
- if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
-
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
if (tx_size == plane_tx_size) {
@@ -4994,8 +5024,8 @@
for (i = 0; i < 4; ++i) {
int offsetr = (i >> 1) * bh / 2;
int offsetc = (i & 0x01) * bh / 2;
- update_txfm_count(x, xd, counts, tx_size - 1, blk_row + offsetr,
- blk_col + offsetc);
+ update_txfm_count(x, xd, counts, tx_size - 1, depth + 1,
+ blk_row + offsetr, blk_col + offsetc);
}
}
}
@@ -5017,7 +5047,8 @@
for (idy = 0; idy < mi_height; idy += bh)
for (idx = 0; idx < mi_width; idx += bh)
- update_txfm_count(x, xd, td_counts, max_tx_size, idy, idx);
+ update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
+ idy, idx);
}
static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
@@ -5025,13 +5056,10 @@
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const int tx_row = blk_row >> 1;
const int tx_col = blk_col >> 1;
- int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
- if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
- if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
-
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
if (tx_size == plane_tx_size) {
@@ -5206,6 +5234,7 @@
const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
: intra_tx_size_cat_lookup[bsize];
const TX_SIZE coded_tx_size = txsize_sqr_up_map[mbmi->tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -5219,7 +5248,7 @@
if (is_inter) {
tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
} else {
- ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
if (mbmi->tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
}
#if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -5227,7 +5256,7 @@
#endif
#endif
#if !CONFIG_VAR_TX
- ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
#endif
} else {
int i, j;
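On the tx_size counter change above: the last index is now a depth level rather than a raw TX_SIZE, which is why the literal TX_4X4 ... TX_32X32 indices in the counting loop became 0 ... 3 and why the encoder now records tx_size_to_depth(coded_tx_size). A minimal sketch of the square-size mapping the new indices imply (illustrative; the real helper is tx_size_to_depth):

/* Sketch only: depth indexing of the square sizes used by the counters,
 * matching counts->tx_size[max_depth][context_idx][this_depth_level]. */
static int square_tx_depth(TX_SIZE tx_size) {
  return (int)tx_size - (int)TX_4X4; /* TX_4X4 -> 0, ..., TX_32X32 -> 3 */
}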
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index c5459dc..3a33f35 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -451,7 +451,7 @@
const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size];
#endif
const int16_t *src_diff;
- const int tx2d_size = get_tx2d_size(tx_size);
+ const int tx2d_size = tx_size_2d[tx_size];
FWD_TXFM_PARAM fwd_txfm_param;
QUANT_PARAM qparam;
@@ -542,14 +542,13 @@
highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
highbd_quantize_32x32_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant,
- p->quant_shift, pd->dequant,
- (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+ pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
dqcoeff, eob, scan_order->scan, band);
} else {
- highbd_quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
- p->quant, p->quant_shift, pd->dequant,
+ highbd_quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
+ p->quant_shift, pd->dequant,
(const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
qcoeff, dqcoeff, eob, scan_order->scan, band);
@@ -566,7 +565,7 @@
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
qcoeff, dqcoeff, eob, scan_order->scan, band);
} else {
- quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant,
+ quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
p->quant_shift, pd->dequant,
(const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
@@ -613,14 +612,14 @@
highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
highbd_quantize_32x32_fp_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
- pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
dqcoeff, eob, scan_order->scan, band);
} else {
highbd_quantize_fp_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
- pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
dqcoeff, eob, scan_order->scan, band);
}
@@ -630,13 +629,13 @@
fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
- quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+ quantize_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
p->quant_fp, pd->dequant,
(const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
qcoeff, dqcoeff, eob, scan_order->scan, band);
} else {
- quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
+ quantize_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp,
pd->dequant,
(const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
(const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
@@ -681,11 +680,11 @@
highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
highbd_quantize_dc_32x32_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant[0],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
} else {
- highbd_quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+ highbd_quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
p->quant[0], p->quant_shift[0], pd->dequant[0],
p->cuml_bins_nuq[dq][0],
pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
@@ -696,12 +695,12 @@
fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
- quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+ quantize_dc_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
p->quant[0], p->quant_shift[0], pd->dequant[0],
p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
qcoeff, dqcoeff, eob);
} else {
- quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant[0],
+ quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
}
@@ -744,12 +743,12 @@
highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
highbd_quantize_dc_32x32_fp_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp[0],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
qcoeff, dqcoeff, eob);
} else {
highbd_quantize_dc_fp_nuq(
- coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp[0],
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
qcoeff, dqcoeff, eob);
}
@@ -759,12 +758,12 @@
fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
if (tx_size == TX_32X32) {
- quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+ quantize_dc_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
p->quant_fp[0], pd->dequant[0],
p->cuml_bins_nuq[dq][0],
pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
} else {
- quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+ quantize_dc_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
p->quant_fp[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
}
@@ -864,14 +863,8 @@
const int tx_row = blk_row >> (1 - pd->subsampling_y);
const int tx_col = blk_col >> (1 - pd->subsampling_x);
TX_SIZE plane_tx_size;
-
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 53dac12..8a6ad18 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -30,8 +30,8 @@
av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
}
-static void encode_mv_component(aom_writer *w, int comp,
- const nmv_component *mvcomp, int usehp) {
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+ int usehp) {
int offset;
const int sign = comp < 0;
const int mag = sign ? -comp : comp;
@@ -46,7 +46,7 @@
aom_write(w, sign, mvcomp->sign);
// Class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
#else
av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
@@ -63,7 +63,7 @@
}
// Fractional bits
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_write_symbol(
w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
MV_FP_SIZE);
@@ -141,9 +141,16 @@
static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p,
aom_prob upd_p) {
(void)upd_p;
- av1_cond_prob_diff_update(w, cur_p, ct);
+#if CONFIG_TILE_GROUPS
+ // Just use the maximum number of tile groups to avoid passing in the actual
+ // number
+ av1_cond_prob_diff_update(w, cur_p, ct, MAX_NUM_TG);
+#else
+ av1_cond_prob_diff_update(w, cur_p, ct, 1);
+#endif
}
+#if !CONFIG_EC_ADAPT
static void write_mv_update(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
@@ -158,19 +165,22 @@
for (i = 0; i < n - 1; ++i)
update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
}
+#endif
void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
nmv_context_counts *const nmv_counts) {
- int i, j;
+ int i;
#if CONFIG_REF_MV
int nmv_ctx = 0;
for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
+#if !CONFIG_EC_ADAPT
write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS,
w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
@@ -184,6 +194,7 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j)
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
@@ -191,6 +202,7 @@
write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
MV_FP_SIZE, w);
}
+#endif
if (usehp) {
for (i = 0; i < 2; ++i) {
@@ -204,18 +216,17 @@
nmv_context *const mvc = &cm->fc->nmvc;
nmv_context_counts *const counts = nmv_counts;
+#if !CONFIG_EC_ADAPT
write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
MV_CLASSES, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_class_tree, comp->classes, comp->class_cdf);
-#endif
write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
CLASS0_SIZE, w);
for (j = 0; j < MV_OFFSET_BITS; ++j)
@@ -223,20 +234,15 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j) {
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
- mvc->comps[i].class0_fp_cdf[j]);
-#endif
}
write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].fp, mvc->comps[i].fp_cdf);
-#endif
}
+#endif // !CONFIG_EC_ADAPT
if (usehp) {
for (i = 0; i < 2; ++i) {
@@ -252,14 +258,13 @@
#if CONFIG_REF_MV
int is_compound,
#endif
- const nmv_context *mvctx, int usehp) {
+ nmv_context *mvctx, int usehp) {
const MV diff = { mv->row - ref->row, mv->col - ref->col };
const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
- usehp = usehp && av1_use_mv_hp(ref);
#if CONFIG_REF_MV
(void)is_compound;
#endif
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
#else
av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
@@ -312,7 +317,7 @@
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
(void)pred_mvs;
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
}
} else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
@@ -325,7 +330,7 @@
mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
} else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -337,7 +342,7 @@
mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
}
}
@@ -366,7 +371,7 @@
mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
}
} else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
@@ -379,7 +384,7 @@
mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
} else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -391,7 +396,7 @@
mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
#endif
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
}
}
#else
@@ -419,7 +424,7 @@
#endif
const MV diff = { mvs[i].as_mv.row - ref->row,
mvs[i].as_mv.col - ref->col };
- av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+ av1_inc_mv(&diff, counts, 1);
}
}
#endif // CONFIG_EXT_INTER
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 543064e..17baa2d 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -27,7 +27,7 @@
#if CONFIG_REF_MV
int is_compound,
#endif
- const nmv_context *mvctx, int usehp);
+ nmv_context *mvctx, int usehp);
void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
const nmv_context *mvctx, int usehp);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 629eb46..52408b9 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3324,6 +3324,16 @@
av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
#endif
}
+#if CONFIG_DERING
+ if (is_lossless_requested(&cpi->oxcf)) {
+ cm->dering_level = 0;
+ } else {
+ cm->dering_level =
+ av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd);
+ av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
+ }
+#endif // CONFIG_DERING
+
#if CONFIG_CLPF
cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
cm->clpf_size = CLPF_64X64;
@@ -3372,15 +3382,6 @@
}
}
#endif
-#if CONFIG_DERING
- if (is_lossless_requested(&cpi->oxcf)) {
- cm->dering_level = 0;
- } else {
- cm->dering_level =
- av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd);
- av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
- }
-#endif // CONFIG_DERING
#if CONFIG_LOOP_RESTORATION
if (cm->rst_info.restoration_type != RESTORE_NONE) {
av1_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
@@ -3596,7 +3597,7 @@
recon_err = aom_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -3605,8 +3606,6 @@
"%10lf %8u %10"PRId64" %10d %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
- cpi->td.rd_counts.m_search_count,
- cpi->td.rd_counts.ex_search_count,
cpi->rc.source_alt_ref_pending,
cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
@@ -3921,12 +3920,6 @@
set_size_independent_vars(cpi);
- // cpi->sf.use_upsampled_references can be different from frame to frame.
- // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1.
- // The reference frames for this frame have to be up-sampled before encoding.
- if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
- reset_use_upsampled_references(cpi);
-
do {
aom_clear_system_state();
@@ -3935,6 +3928,14 @@
if (loop_count == 0 || cpi->resize_pending != 0) {
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ // cpi->sf.use_upsampled_references can be different from frame to frame.
+      // Whenever cpi->sf.use_upsampled_references changes from 0 to 1, the
+      // reference frames for this frame have to be up-sampled before encoding.
+ if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+ reset_use_upsampled_references(cpi);
+
// TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
set_mv_search_params(cpi);
@@ -4565,6 +4566,12 @@
cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
}
}
+#if CONFIG_TILE_GROUPS
+ if (cm->error_resilient_mode)
+ cm->num_tg = MAX_NUM_TG;
+ else
+ cm->num_tg = 1;
+#endif
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
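Two changes land in the encoder.c hunks above: the dering level search and filter now run immediately after the deblocking loop filter and before the CLPF block (they previously ran after it), and with CONFIG_TILE_GROUPS the encoder requests MAX_NUM_TG tile groups in error-resilient mode and a single one otherwise. The sketch below only captures the resulting post-filter ordering; every function in it is a hypothetical stand-in for the real calls, which take the frame buffers and coder state.

    /* Stubs standing in for av1_loop_filter_frame(), av1_dering_search(),
     * av1_dering_frame(), the CLPF strength search and loop restoration. */
    static void apply_deblocking(void) {}
    static int search_dering_level(void) { return 0; }
    static void apply_dering(int level) { (void)level; }
    static void apply_clpf(void) {}
    static void apply_loop_restoration(void) {}

    /* Ordering established by the hunks above (sketch only). */
    static void postfilter_order_sketch(int lossless) {
      apply_deblocking();
      if (!lossless)                         /* dering is skipped for lossless */
        apply_dering(search_dering_level());
      apply_clpf();                          /* CLPF strength/size decisions */
      apply_loop_restoration();              /* when restoration is enabled */
    }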
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 8738609..00abc71 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -268,13 +268,13 @@
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int mode_map[BLOCK_SIZES][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
} TileDataEnc;
typedef struct RD_COUNTS {
av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
- int m_search_count;
- int ex_search_count;
} RD_COUNTS;
typedef struct ThreadData {
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 117d0ed..5876d15 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -27,10 +27,6 @@
for (n = 0; n < ENTROPY_TOKENS; n++)
td->rd_counts.coef_counts[i][j][k][l][m][n] +=
td_t->rd_counts.coef_counts[i][j][k][l][m][n];
-
- // Counts of all motion searches and exhuastive mesh searches.
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
- td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
}
static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
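The encoder.h and ethread.c hunks move m_search_count and ex_search_count out of the per-thread RD_COUNTS and into TileDataEnc, so the explicit accumulation in the worker-thread hook (deleted above) is no longer needed: tile data is indexed per tile rather than copied per thread. A trimmed sketch of where the counters now live; both structs have many more members in the real code.

    /* Per-tile bookkeeping (sketch). A worker encoding a tile increments
     * this_tile->m_search_count / ex_search_count directly instead of a
     * per-thread copy that had to be summed back into the main thread. */
    typedef struct {
      int m_search_count;   /* motion searches performed in this tile */
      int ex_search_count;  /* exhaustive (mesh) searches in this tile */
    } TileDataEncSketch;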
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 3fbceab..4b54a2c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -424,7 +424,7 @@
tr = br;
tc = bc;
- if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+ if (allow_hp && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -484,7 +484,7 @@
}
}
- if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+ if (allow_hp && forced_stop == 0) {
tr = br;
tc = bc;
hstep >>= 1;
@@ -572,7 +572,7 @@
tc = bc;
}
- if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+ if (allow_hp && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -687,7 +687,7 @@
unsigned int cost_array[5];
int kr, kc;
- if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+ if (!allow_hp)
if (round == 3) round = 2;
bestmv->row *= 8;
@@ -2446,7 +2446,7 @@
tc = bc;
}
- if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+ if (allow_hp && forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
@@ -2581,7 +2581,7 @@
y_stride = pd->pre[is_second].stride;
offset = bestmv->row * y_stride + bestmv->col;
- if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+ if (!allow_hp)
if (round == 3) round = 2;
bestmv->row *= 8;
@@ -3083,7 +3083,7 @@
y_stride = pd->pre[is_second].stride;
offset = bestmv->row * y_stride + bestmv->col;
- if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+ if (!allow_hp)
if (round == 3) round = 2;
bestmv->row *= 8;
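All of the mcomp.c hunks make the same change as the encodemv.c ones earlier: the final 1/8-pel refinement (and the round-3 cap) now depends only on allow_hp and forced_stop, not on av1_use_mv_hp(ref_mv). A one-line sketch of the new condition as it appears in the hunks above:

    /* Whether the last, 1/8-pel, refinement round runs (sketch): the length
     * of the reference MV no longer plays a part in the decision. */
    static int run_eighth_pel_round_sketch(int allow_hp, int forced_stop) {
      return allow_hp && forced_stop == 0;
    }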
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 4ef83cd..0c79e45 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+// clang-format off
+
#include <string.h>
#include <math.h>
@@ -41,7 +43,7 @@
int nhsb, nvsb;
od_dering_in *src;
int16_t *ref_coeff;
- unsigned char *bskip;
+ unsigned char bskip[MAX_MIB_SIZE*MAX_MIB_SIZE][2];
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
int stride;
int bsize[3];
@@ -49,10 +51,10 @@
int pli;
int level;
int best_level;
+ int dering_count;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
- bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
av1_setup_dst_planes(xd->plane, frame, 0, 0);
for (pli = 0; pli < 3; pli++) {
dec[pli] = xd->plane[pli].subsampling_x;
@@ -77,13 +79,6 @@
#endif
}
}
- for (r = 0; r < cm->mi_rows; ++r) {
- for (c = 0; c < cm->mi_cols; ++c) {
- const MB_MODE_INFO *mbmi =
- &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
- bskip[r * cm->mi_cols + c] = mbmi->skip;
- }
- }
nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
/* Pick a base threshold based on the quantizer. The threshold will then be
@@ -105,20 +100,28 @@
int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
- if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+ if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip, &dering_count))
+ continue;
best_gi = 0;
for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
int cur_mse;
int threshold;
level = compute_level_from_index(best_level, gi);
threshold = level << coeff_shift;
+ for (r = 0; r < bsize[0] * nvb; r++) {
+ for (c = 0; c < bsize[0] * nhb; c++) {
+ dst[r * MAX_MIB_SIZE * bsize[0] + c] =
+ src[(sbr * bsize[0] * MAX_MIB_SIZE + r) * stride +
+ sbc * bsize[0] * MAX_MIB_SIZE + c];
+ }
+ }
od_dering(dst, MAX_MIB_SIZE * bsize[0],
&src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
sbc * bsize[0] * MAX_MIB_SIZE],
cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
dir, 0,
- &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
- cm->mi_cols, threshold, coeff_shift);
+ bskip,
+ dering_count, threshold, coeff_shift);
cur_mse = (int)compute_dist(
dst, MAX_MIB_SIZE * bsize[0],
&ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
@@ -136,6 +139,5 @@
}
aom_free(src);
aom_free(ref_coeff);
- aom_free(bskip);
return best_level;
}
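pickdering.c no longer allocates a full mi_rows x mi_cols skip map. Instead, sb_all_skip_out(), presumably the new counterpart of sb_all_skip(), is expected to fill a small per-superblock list of non-skipped block positions and report how many there are, and that list plus dering_count is what od_dering() now consumes; the superblock's source pixels are also copied into dst up front so every candidate level starts from unfiltered data. The helper below is a hedged guess at such a routine, reconstructed only from the call site above; its signature and the skip_grid layout are assumptions.

    #define SB_MI_SIZE_SKETCH 8  /* stands in for MAX_MIB_SIZE (assumption) */

    /* Record the (row, col) of every non-skipped block inside one superblock;
     * return 1 when every block is skipped so the caller can bail out. */
    static int sb_all_skip_out_sketch(const unsigned char *skip_grid,
                                      int mi_rows, int mi_cols,
                                      int mi_row0, int mi_col0,
                                      unsigned char (*bskip)[2],
                                      int *dering_count) {
      int r, c, all_skip = 1;
      *dering_count = 0;
      for (r = 0; r < SB_MI_SIZE_SKETCH && mi_row0 + r < mi_rows; r++) {
        for (c = 0; c < SB_MI_SIZE_SKETCH && mi_col0 + c < mi_cols; c++) {
          if (!skip_grid[(mi_row0 + r) * mi_cols + (mi_col0 + c)]) {
            bskip[*dering_count][0] = (unsigned char)r;
            bskip[*dering_count][1] = (unsigned char)c;
            ++*dering_count;
            all_skip = 0;
          }
        }
      }
      return all_skip;
    }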
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 4a2bc3f..f5af485 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -719,31 +719,42 @@
}
#if CONFIG_DUAL_FILTER
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- int inter_filter_cost = 0;
- int dir;
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int inter_filter_cost = 0;
+ int dir;
- for (dir = 0; dir < 2; ++dir) {
- if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
- (mbmi->ref_frame[1] > INTRA_FRAME &&
- has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
- const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
- inter_filter_cost +=
- cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ inter_filter_cost +=
+ cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+ }
}
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
}
- return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
}
#else
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- const int ctx = av1_get_pred_context_switchable_interp(xd);
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->interp_filter == SWITCHABLE) {
#if CONFIG_EXT_INTERP
- if (!av1_is_interp_needed(xd)) return 0;
-#endif // CONFIG_EXT_INTERP
- return SWITCHABLE_INTERP_RATE_FACTOR *
- cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+ if (av1_is_interp_needed(xd))
+#endif
+ {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+ return SWITCHABLE_INTERP_RATE_FACTOR *
+ cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+ }
+ }
+ return 0;
}
#endif
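av1_get_switchable_rate() now performs the cm->interp_filter == SWITCHABLE check itself and returns 0 when the frame pins a single filter, so call sites no longer need to guard the call (the "rs = cm->interp_filter == SWITCHABLE ? ... : 0" pattern disappears from rdopt.c further down). Minimal sketch of the new contract, with the sentinel value and the cost argument as stand-ins:

    enum { FILTER_SWITCHABLE_SKETCH = 4 };  /* stand-in sentinel value */

    /* The per-block filter signalling rate is only paid when the frame
     * header says the filter is chosen per block; otherwise it is free. */
    static int switchable_rate_sketch(int frame_filter, int block_filter_rate) {
      if (frame_filter != FILTER_SWITCHABLE_SKETCH) return 0;
      return block_filter_rate;  /* already includes the rate factor scaling */
    }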
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 76d471e..8682a3e 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -707,17 +707,17 @@
switch (cpi->sf.tx_type_search.prune_mode) {
case NO_PRUNE: return 0; break;
case PRUNE_ONE:
- if ((tx_set >= 0) & !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
return 0;
return prune_one_for_sby(cpi, bsize, x, xd);
break;
#if CONFIG_EXT_TX
case PRUNE_TWO:
- if ((tx_set >= 0) & !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
}
- if ((tx_set >= 0) & !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
+ if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
break;
@@ -1318,9 +1318,9 @@
const int tx_size_cat =
is_inter ? inter_tx_size_cat_lookup[bs] : intra_tx_size_cat_lookup[bs];
const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
const int tx_select = cm->tx_mode == TX_MODE_SELECT;
- const int r_tx_size =
- cpi->tx_size_cost[tx_size_cat][tx_size_ctx][coded_tx_size];
+ const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
assert(skip_prob > 0);
#if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -2746,7 +2746,7 @@
// not the tokenonly rate.
this_rate_tokenonly -=
cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
- [mic->mbmi.tx_size];
+ [tx_size_to_depth(mic->mbmi.tx_size)];
}
#if CONFIG_PALETTE
if (cpi->common.allow_screen_content_tools && mic->mbmi.mode == DC_PRED)
@@ -2918,9 +2918,12 @@
TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
const SCAN_ORDER *const scan_order =
get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
-
BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
- int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+ int bh = block_size_high[txm_bsize];
+ int bw = block_size_wide[txm_bsize];
+ int txb_h = tx_size_high_unit[tx_size];
+ int txb_w = tx_size_wide_unit[tx_size];
+
int src_stride = p->src.stride;
uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
@@ -2930,20 +2933,21 @@
#else
DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
+ const int diff_stride = max_blocks_wide;
const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
#if CONFIG_EXT_TX
assert(tx_size < TX_SIZES);
#endif // CONFIG_EXT_TX
if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
#if CONFIG_NEW_QUANT
av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
@@ -2960,22 +2964,21 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
- 0, NULL, 0, bh, bh, xd->bd);
+ 0, NULL, 0, bw, bh, xd->bd);
} else {
rec_buffer = (uint8_t *)rec_buffer16;
aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
- NULL, 0, bh, bh);
+ NULL, 0, bw, bh);
}
#else
aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
- 0, bh, bh);
+ 0, bw, bh);
#endif // CONFIG_AOM_HIGHBITDEPTH
- if (blk_row + (bh >> 2) > max_blocks_high ||
- blk_col + (bh >> 2) > max_blocks_wide) {
+ if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
int idx, idy;
- int blocks_height = AOMMIN(bh >> 2, max_blocks_high - blk_row);
- int blocks_width = AOMMIN(bh >> 2, max_blocks_wide - blk_col);
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
tmp = 0;
for (idy = 0; idy < blocks_height; idy += 2) {
for (idx = 0; idx < blocks_width; idx += 2) {
@@ -2984,7 +2987,7 @@
}
}
} else {
- tmp = aom_sum_squares_2d_i16(diff, diff_stride, bh);
+ tmp = sum_squares_2d(diff, diff_stride, tx_size);
}
#if CONFIG_AOM_HIGHBITDEPTH
@@ -3010,12 +3013,12 @@
inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
#endif // CONFIG_AOM_HIGHBITDEPTH
- if ((bh >> 2) + blk_col > max_blocks_wide ||
- (bh >> 2) + blk_row > max_blocks_high) {
+ if (txb_h + blk_col > max_blocks_wide ||
+ txb_w + blk_row > max_blocks_high) {
int idx, idy;
unsigned int this_dist;
- int blocks_height = AOMMIN(bh >> 2, max_blocks_high - blk_row);
- int blocks_width = AOMMIN(bh >> 2, max_blocks_wide - blk_col);
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
tmp = 0;
for (idy = 0; idy < blocks_height; idy += 2) {
for (idx = 0; idx < blocks_width; idx += 2) {
@@ -3054,20 +3057,20 @@
TX_SIZE(*const inter_tx_size)
[MAX_MIB_SIZE] =
(TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
- const int bw = num_4x4_blocks_wide_lookup[plane_bsize];
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = bw;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
int64_t this_rd = INT64_MAX;
ENTROPY_CONTEXT *pta = ta + blk_col;
ENTROPY_CONTEXT *ptl = tl + blk_row;
- ENTROPY_CONTEXT stxa = 0, stxl = 0;
int coeff_ctx, i;
- int ctx = txfm_partition_context(tx_above + (blk_col >> 1),
- tx_left + (blk_row >> 1), tx_size);
+ int ctx =
+ txfm_partition_context(tx_above + (blk_col >> 1),
+ tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
int64_t sum_dist = 0, sum_bsse = 0;
int64_t sum_rd = INT64_MAX;
- int sum_rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+ int sum_rate = 0;
int all_skip = 1;
int tmp_eob = 0;
int zero_blk_rate;
@@ -3081,31 +3084,7 @@
return;
}
- switch (tx_size) {
- case TX_4X4:
- stxa = pta[0];
- stxl = ptl[0];
- break;
- case TX_8X8:
- stxa = !!*(const uint16_t *)&pta[0];
- stxl = !!*(const uint16_t *)&ptl[0];
- break;
- case TX_16X16:
- stxa = !!*(const uint32_t *)&pta[0];
- stxl = !!*(const uint32_t *)&ptl[0];
- break;
- case TX_32X32:
- stxa = !!*(const uint64_t *)&pta[0];
- stxl = !!*(const uint64_t *)&ptl[0];
- break;
- default: assert(0 && "Invalid transform size."); break;
- }
- coeff_ctx = combine_entropy_contexts(stxa, stxl);
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ coeff_ctx = get_entropy_context(tx_size, pta, ptl);
*rate = 0;
*dist = 0;
@@ -3144,8 +3123,10 @@
if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
- int bsl = b_height_log2_lookup[bsize];
- int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+ int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
+ // TODO(jingning): Refactor this transform block size transition.
+ TX_SIZE sub_txs = tx_size - 1;
+ int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
int this_rate;
int64_t this_dist;
int64_t this_bsse;
@@ -3153,18 +3134,17 @@
int this_cost_valid = 1;
int64_t tmp_rd = 0;
+ sum_rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
#if CONFIG_EXT_TX
assert(tx_size < TX_SIZES);
#endif // CONFIG_EXT_TX
- --bsl;
for (i = 0; i < 4 && this_cost_valid; ++i) {
- int offsetr = (i >> 1) << bsl;
- int offsetc = (i & 0x01) << bsl;
+ int offsetr = (i >> 1) * bsl;
+ int offsetc = (i & 0x01) * bsl;
select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
- block + i * sub_step, tx_size - 1, depth + 1, plane_bsize,
- ta, tl, tx_above, tx_left, &this_rate, &this_dist,
- &this_bsse, &this_skip, ref_best_rd - tmp_rd,
- &this_cost_valid);
+ block + i * sub_step, sub_txs, depth + 1, plane_bsize, ta,
+ tl, tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
+ &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
sum_rate += this_rate;
sum_dist += this_dist;
sum_bsse += this_bsse;
@@ -3177,15 +3157,13 @@
if (this_rd < sum_rd) {
int idx, idy;
- for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
- pta[i] = !(tmp_eob == 0);
- for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
- ptl[i] = !(tmp_eob == 0);
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
tx_size);
inter_tx_size[0][0] = tx_size;
- for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
- for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
inter_tx_size[idy][idx] = tx_size;
mbmi->tx_size = tx_size;
if (this_rd == INT64_MAX) *is_cost_valid = 0;
@@ -3222,7 +3200,8 @@
int bh = num_4x4_blocks_wide_lookup[txb_size];
int idx, idy;
int block = 0;
- int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
+ int step = tx_size_wide_unit[max_txsize_lookup[plane_bsize]] *
+ tx_size_high_unit[max_txsize_lookup[plane_bsize]];
ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
@@ -3478,17 +3457,20 @@
const int tx_row = blk_row >> (1 - pd->subsampling_y);
const int tx_col = blk_col >> (1 - pd->subsampling_x);
TX_SIZE plane_tx_size;
- int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
#if CONFIG_EXT_TX
assert(tx_size < TX_SIZES);
#endif // CONFIG_EXT_TX
if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
if (xd->mb_to_right_edge < 0)
- max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
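The shift change in this hunk (and its earlier twin in this file) is a unit conversion rather than a behavioural change. Assuming xd->mb_to_bottom_edge / mb_to_right_edge are kept in 1/8-pel units and the smallest transform is 4x4 (tx_size_wide_log2[0] == 2), shifting by (3 + subsampling) gives pixels and the later >> 2 gives 4x4 units, which is exactly what the old single shift by (5 + subsampling) produced; working in pixels first is what lets block_size_high/wide be used directly. Worked sketch:

    /* edge >> (5 + ss) == (edge >> (3 + ss)) >> 2 for edge in 1/8-pel units,
     * so the two-step form above computes the same 4x4-unit clamp. */
    static int edge_to_4x4_units_sketch(int edge_eighth_pel, int subsampling) {
      int pixels = edge_eighth_pel >> (3 + subsampling); /* 1/8 pel -> pixels */
      return pixels >> 2;                                /* pixels -> 4x4 units */
    }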
@@ -3519,24 +3501,25 @@
coeff_ctx = combine_entropy_contexts(ta[0], tl[0]);
av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
plane_bsize, coeff_ctx, rate, dist, bsse, skip);
- for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
+
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
ta[i] = !(p->eobs[block] == 0);
- for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
tl[i] = !(p->eobs[block] == 0);
} else {
- int bsl = b_width_log2_lookup[bsize];
- int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+ const int bsl = block_size_wide[bsize] >> (1 + tx_size_wide_log2[0]);
+ const TX_SIZE sub_txs = tx_size - 1;
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
int i;
assert(bsl > 0);
- --bsl;
for (i = 0; i < 4; ++i) {
- int offsetr = (i >> 1) << bsl;
- int offsetc = (i & 0x01) << bsl;
+ int offsetr = (i >> 1) * bsl;
+ int offsetc = (i & 0x01) * bsl;
tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
- block + i * step, tx_size - 1, plane_bsize, above_ctx,
- left_ctx, rate, dist, bsse, skip);
+ block + i * step, sub_txs, plane_bsize, above_ctx, left_ctx,
+ rate, dist, bsse, skip);
}
}
}
@@ -4218,8 +4201,7 @@
#endif // CONFIG_EXT_INTER
this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
#if CONFIG_EXT_INTER
- if (!cpi->common.allow_high_precision_mv ||
- !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+ if (!cpi->common.allow_high_precision_mv)
lower_mv_precision(&this_mv[0].as_mv, 0);
#endif // CONFIG_EXT_INTER
@@ -4278,11 +4260,9 @@
this_mv[0].as_int = compound_seg_newmvs[0].as_int;
this_mv[1].as_int = compound_seg_newmvs[1].as_int;
}
- if (!cpi->common.allow_high_precision_mv ||
- !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+ if (!cpi->common.allow_high_precision_mv)
lower_mv_precision(&this_mv[0].as_mv, 0);
- if (!cpi->common.allow_high_precision_mv ||
- !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+ if (!cpi->common.allow_high_precision_mv)
lower_mv_precision(&this_mv[1].as_mv, 0);
thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4292,8 +4272,7 @@
case NEW_NEARMV:
case NEW_NEARESTMV:
this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
- if (!cpi->common.allow_high_precision_mv ||
- !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+ if (!cpi->common.allow_high_precision_mv)
lower_mv_precision(&this_mv[0].as_mv, 0);
thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4303,8 +4282,7 @@
case NEAREST_NEWMV:
this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
- if (!cpi->common.allow_high_precision_mv ||
- !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+ if (!cpi->common.allow_high_precision_mv)
lower_mv_precision(&this_mv[1].as_mv, 0);
thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4374,8 +4352,8 @@
struct macroblock_plane *const p = &x->plane[0];
MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
- const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ const int width = block_size_wide[plane_bsize];
+ const int height = block_size_high[plane_bsize];
int idx, idy;
const uint8_t *const src =
&p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
@@ -4387,8 +4365,8 @@
TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
- const int num_4x4_w = num_4x4_blocks_wide_txsize_lookup[tx_size];
- const int num_4x4_h = num_4x4_blocks_high_txsize_lookup[tx_size];
+ const int num_4x4_w = tx_size_wide_unit[tx_size];
+ const int num_4x4_h = tx_size_high_unit[tx_size];
#if CONFIG_EXT_TX && CONFIG_RECT_TX
assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
@@ -4428,11 +4406,8 @@
block = k;
else
block = (i ? 2 : 0);
-#if CONFIG_VAR_TX
- coeff_ctx = get_entropy_context(tx_size, ta + (k & 1), tl + (k >> 1));
-#else
+
coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
-#endif
#if CONFIG_NEW_QUANT
av1_xform_quant_fp_nuq(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
BLOCK_8X8, tx_size, coeff_ctx);
@@ -5145,8 +5120,7 @@
if (!has_second_rf &&
#if CONFIG_EXT_INTER
have_newmv_in_inter_mode(this_mode) &&
- (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV ||
- av1_use_mv_hp(&bsi->ref_mv[0]->as_mv) == 0)
+ (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
#else
this_mode == NEWMV &&
(seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
@@ -6367,10 +6341,6 @@
const int this_mode = mbmi->mode;
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
-#if CONFIG_DUAL_FILTER
- (void)pred_filter_search;
- return SWITCHABLE;
-#else
if (pred_filter_search) {
InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
@@ -6385,7 +6355,6 @@
#endif // CONFIG_EXT_INTER
best_filter = af;
}
-#endif
if (is_comp_pred) {
if (cpi->sf.adaptive_mode_search) {
#if CONFIG_EXT_INTER
@@ -6448,15 +6417,8 @@
#endif // CONFIG_EXT_INTER
}
}
- if (cm->interp_filter != BILINEAR) {
- if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
- best_filter = EIGHTTAP_REGULAR;
- }
-#if CONFIG_EXT_INTERP
- else if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
- best_filter = EIGHTTAP_REGULAR;
- }
-#endif
+ if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+ best_filter = EIGHTTAP_REGULAR;
}
return best_filter;
}
@@ -6676,6 +6638,7 @@
int_mv cur_mv[2];
int rate_mv = 0;
#if CONFIG_EXT_INTER
+ int pred_exists = 1;
const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
int mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
int_mv single_newmv[TOTAL_REFS_PER_FRAME];
@@ -6706,6 +6669,7 @@
uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
#endif // CONFIG_VAR_TX
int64_t best_distortion = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
MB_MODE_INFO best_mbmi;
#if CONFIG_EXT_INTER
int rate2_bmc_nocoeff;
@@ -6713,24 +6677,13 @@
MB_MODE_INFO best_bmc_mbmi;
#endif // CONFIG_EXT_INTER
#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
- int pred_exists = 0;
- int intpel_mv;
- int64_t rd, tmp_rd, best_rd = INT64_MAX;
- int best_needs_copy = 0;
+ int64_t rd = INT64_MAX;
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
+ uint8_t *tmp_dst[MAX_MB_PLANE];
+ int tmp_dst_stride[MAX_MB_PLANE];
int rs = 0;
-#if CONFIG_DUAL_FILTER
- // Index use case:
- // {0, 1} -> (vertical, horizontal) filter types for the first ref frame
- // {2, 3} -> (vertical, horizontal) filter types for the second ref frame
- InterpFilter best_filter[4] = {
- SWITCHABLE, SWITCHABLE, SWITCHABLE, SWITCHABLE,
- };
-#else
- InterpFilter best_filter = SWITCHABLE;
-#endif
+ InterpFilter assign_filter = SWITCHABLE;
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
@@ -6792,24 +6745,24 @@
av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
#endif // CONFIG_REF_MV
rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
#if CONFIG_REF_MV
av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
#endif // CONFIG_REF_MV
rate_mv += av1_mv_bit_cost(
- &frame_mv[refs[1]].as_mv, &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ &frame_mv[refs[1]].as_mv, &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
} else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
} else {
frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
#else
@@ -6825,13 +6778,13 @@
av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
#endif // CONFIG_REF_MV
rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
#if CONFIG_REF_MV
av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
#endif // CONFIG_REF_MV
rate_mv += av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
#endif // CONFIG_EXT_INTER
@@ -6966,6 +6919,10 @@
// one for future predictions. In the end, copy from tmp_buf to
// dst if necessary.
for (i = 0; i < MAX_MB_PLANE; i++) {
+ tmp_dst[i] = tmp_buf + i * MAX_SB_SQUARE;
+ tmp_dst_stride[i] = MAX_SB_SIZE;
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
orig_dst[i] = xd->plane[i].dst.buf;
orig_dst_stride[i] = xd->plane[i].dst.stride;
}
@@ -7003,135 +6960,126 @@
)
return INT64_MAX;
- pred_exists = 0;
- // Are all MVs integer pel for Y and UV
- intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
- if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
-
+ if (cm->interp_filter == SWITCHABLE) {
#if !CONFIG_DUAL_FILTER
- best_filter =
- predict_interp_filter(cpi, x, bsize, mi_row, mi_col, single_filter);
+ assign_filter =
+ predict_interp_filter(cpi, x, bsize, mi_row, mi_col, single_filter);
#endif
+#if CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
+ if (!av1_is_interp_needed(xd)) assign_filter = EIGHTTAP_REGULAR;
+#endif
+ } else {
+ assign_filter = cm->interp_filter;
+ }
- if (cm->interp_filter != BILINEAR) {
- int newbest;
- int tmp_rate_sum = 0;
- int64_t tmp_dist_sum = 0;
-
+ { // Do interpolation filter search within this block
+ int tmp_rate;
+ int64_t tmp_dist;
#if CONFIG_DUAL_FILTER
- for (i = 0; i < SWITCHABLE_FILTERS * SWITCHABLE_FILTERS; ++i)
+ mbmi->interp_filter[0] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[1] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[2] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[3] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
#else
- for (i = 0; i < SWITCHABLE_FILTERS; ++i)
+ mbmi->interp_filter =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
#endif
- {
- int j;
- int64_t rs_rd;
- int tmp_skip_sb = 0;
- int64_t tmp_skip_sse = INT64_MAX;
+ rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ if (assign_filter == SWITCHABLE) {
+ // do interp_filter search
+ if (av1_is_interp_needed(xd)) {
+ int best_in_temp = 0;
#if CONFIG_DUAL_FILTER
- mbmi->interp_filter[0] = filter_sets[i][0];
- mbmi->interp_filter[1] = filter_sets[i][1];
- mbmi->interp_filter[2] = filter_sets[i][0];
- mbmi->interp_filter[3] = filter_sets[i][1];
+ InterpFilter best_filter[4];
+ av1_copy(best_filter, mbmi->interp_filter);
#else
- mbmi->interp_filter = i;
+ InterpFilter best_filter = mbmi->interp_filter;
#endif
- rs = av1_get_switchable_rate(cpi, xd);
- rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+#if CONFIG_DUAL_FILTER
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < SWITCHABLE_FILTERS * SWITCHABLE_FILTERS; ++i)
+#else
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i)
+#endif
+ {
+ int tmp_skip_sb = 0;
+ int64_t tmp_skip_sse = INT64_MAX;
+ int tmp_rs;
+ int64_t tmp_rd;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[i][0];
+ mbmi->interp_filter[1] = filter_sets[i][1];
+ mbmi->interp_filter[2] = filter_sets[i][0];
+ mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+ mbmi->interp_filter = i;
+#endif
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
- if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
- rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
- if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
- } else {
- int rate_sum = 0;
- int64_t dist_sum = 0;
- if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) ||
-#if CONFIG_EXT_INTER
- is_comp_interintra_pred ||
-#endif // CONFIG_EXT_INTER
- (cm->interp_filter != SWITCHABLE &&
- (
+ if (tmp_rd < rd) {
+ rd = tmp_rd;
+ rs = av1_get_switchable_rate(cpi, xd);
#if CONFIG_DUAL_FILTER
- cm->interp_filter == mbmi->interp_filter[0]
+ av1_copy(best_filter, mbmi->interp_filter);
#else
- cm->interp_filter == mbmi->interp_filter
+ best_filter = mbmi->interp_filter;
#endif
- || (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
- restore_dst_buf(xd, orig_dst, orig_dst_stride);
- } else {
- for (j = 0; j < MAX_MB_PLANE; j++) {
- xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
- xd->plane[j].dst.stride = MAX_SB_SIZE;
+ skip_txfm_sb = tmp_skip_sb;
+ skip_sse_sb = tmp_skip_sse;
+ best_in_temp = !best_in_temp;
+ if (best_in_temp) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ } else {
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+ }
}
}
- av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &rate_sum,
- &dist_sum, &tmp_skip_sb, &tmp_skip_sse);
-
- rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
- if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
-
- if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
- tmp_rate_sum = rate_sum;
- tmp_dist_sum = dist_sum;
+ if (best_in_temp) {
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+ } else {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
- }
- newbest = i == 0 || rd < best_rd;
-
- if (newbest) {
- best_rd = rd;
#if CONFIG_DUAL_FILTER
- best_filter[0] = mbmi->interp_filter[0];
- best_filter[1] = mbmi->interp_filter[1];
- best_filter[2] = mbmi->interp_filter[2];
- best_filter[3] = mbmi->interp_filter[3];
+ av1_copy(mbmi->interp_filter, best_filter);
#else
- best_filter = mbmi->interp_filter;
+ mbmi->interp_filter = best_filter;
#endif
- if (cm->interp_filter == SWITCHABLE && i &&
- !(intpel_mv && IsInterpolatingFilter(i)))
- best_needs_copy = !best_needs_copy;
- }
-
- if ((cm->interp_filter == SWITCHABLE && newbest) ||
- (cm->interp_filter != SWITCHABLE &&
-#if CONFIG_DUAL_FILTER
- cm->interp_filter == mbmi->interp_filter[0]
-#else
- cm->interp_filter == mbmi->interp_filter
-#endif
- )) {
- pred_exists = 1;
- tmp_rd = best_rd;
-
- skip_txfm_sb = tmp_skip_sb;
- skip_sse_sb = tmp_skip_sse;
} else {
- pred_exists = 0;
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+ int tmp_rs;
+ InterpFilter best_filter = mbmi->interp_filter;
+ rs = av1_get_switchable_rate(cpi, xd);
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filter = i;
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ if (tmp_rs < rs) {
+ rs = tmp_rs;
+ best_filter = i;
+ }
+ }
+ mbmi->interp_filter = best_filter;
+#else
+ assert(0);
+#endif
}
}
- restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
-// Set the appropriate filter
-#if CONFIG_DUAL_FILTER
- mbmi->interp_filter[0] =
- cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[0];
- mbmi->interp_filter[1] =
- cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[1];
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- mbmi->interp_filter[2] =
- cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[2];
- mbmi->interp_filter[3] =
- cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[3];
- }
-#else
- mbmi->interp_filter =
- cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
-#endif
- rs = cm->interp_filter == SWITCHABLE ? av1_get_switchable_rate(cpi, xd) : 0;
-
#if CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
best_bmc_mbmi = *mbmi;
@@ -7264,7 +7212,6 @@
return INT64_MAX;
pred_exists = 0;
- tmp_rd = AOMMIN(best_rd_wedge, best_rd_nowedge);
if (mbmi->use_wedge_interinter)
*compmode_wedge_cost =
@@ -7405,7 +7352,6 @@
}
pred_exists = 0;
- tmp_rd = best_interintra_rd;
*compmode_interintra_cost =
av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
*compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
@@ -7432,29 +7378,15 @@
pred_exists = 0;
}
#endif // CONFIG_EXT_INTERP
-#endif // CONFIG_EXT_INTER
-
- if (pred_exists) {
- if (best_needs_copy) {
- // again temporarily set the buffers to local memory to prevent a memcpy
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = tmp_buf + i * MAX_SB_SQUARE;
- xd->plane[i].dst.stride = MAX_SB_SIZE;
- }
- }
- rd = tmp_rd;
- } else {
+ if (pred_exists == 0) {
int tmp_rate;
int64_t tmp_dist;
-
- // Handles the special case when a filter that is not in the
- // switchable list (ex. bilinear) is indicated at the frame level, or
- // skip condition holds.
av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
&tmp_dist, &skip_txfm_sb, &skip_sse_sb);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
}
+#endif // CONFIG_EXT_INTER
#if CONFIG_DUAL_FILTER
if (!is_comp_pred) single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
@@ -7498,6 +7430,7 @@
for (mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->motion_mode < (allow_motvar ? MOTION_MODES : 1);
mbmi->motion_mode++) {
+ int64_t tmp_rd = INT64_MAX;
#if CONFIG_EXT_INTER
int tmp_rate2 = mbmi->motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff
: rate2_nocoeff;
@@ -8054,7 +7987,7 @@
// (prediction granularity), so we account for it in the full rate,
// not the tokenonly rate.
rate_y -= cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
- [mbmi->tx_size];
+ [tx_size_to_depth(mbmi->tx_size)];
}
rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
@@ -8811,8 +8744,9 @@
// tokenonly rate, but for intra blocks, tx_size is always coded
// (prediction granularity), so we account for it in the full rate,
// not the tokenonly rate.
- rate_y -= cpi->tx_size_cost[max_tx_size - TX_8X8]
- [get_tx_size_context(xd)][mbmi->tx_size];
+ rate_y -=
+ cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+ [tx_size_to_depth(mbmi->tx_size)];
}
#if CONFIG_EXT_INTRA
if (is_directional_mode) {
@@ -10227,14 +10161,13 @@
this_rd_thresh = (ref_frame == LAST3_FRAME)
? rd_opt->threshes[segment_id][bsize][THR_LAST3]
: this_rd_thresh;
+ this_rd_thresh = (ref_frame == BWDREF_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+ : this_rd_thresh;
#endif // CONFIG_EXT_REFS
this_rd_thresh = (ref_frame == GOLDEN_FRAME)
? rd_opt->threshes[segment_id][bsize][THR_GOLD]
: this_rd_thresh;
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): To explore whether this_rd_thresh should consider
-// BWDREF_FRAME and ALTREF_FRAME
-#endif // CONFIG_EXT_REFS
// TODO(any): Add search of the tx_type to improve rd performance at the
// expense of speed.
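The largest rdopt.c change above replaces the old exhaustive filter loop, with its intpel_mv shortcut, best_needs_copy bookkeeping and trailing "Set the appropriate filter" block, by a two-stage flow: EIGHTTAP_REGULAR (or the frame-pinned filter) is evaluated once up front, and only when the frame filter is SWITCHABLE and av1_is_interp_needed() says the filter actually matters are the remaining combinations tried, ping-ponging the prediction between the original and a temporary destination buffer so the winner never has to be rebuilt. A condensed sketch of that control flow; eval_filter() is a hypothetical stand-in for setting mbmi->interp_filter, building the predictor and running model_rd_for_sb() plus the signalling rate.

    typedef long long RdCostSketch;

    /* Stub so the sketch is self-contained; the real evaluation is the
     * build-predictor / model-rd sequence described above. */
    static RdCostSketch eval_filter(int filter) { return (RdCostSketch)filter; }

    static int pick_interp_filter_sketch(int frame_filter_switchable,
                                         int filter_matters, int num_filters) {
      int best = 0;                            /* EIGHTTAP_REGULAR first */
      RdCostSketch best_rd = eval_filter(best);
      if (frame_filter_switchable && filter_matters) {
        int f;
        for (f = 1; f < num_filters; ++f) {    /* index 0 already evaluated */
          const RdCostSketch rd = eval_filter(f);
          if (rd < best_rd) {
            best_rd = rd;
            best = f;
          }
        }
      }
      return best;
    }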
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index 3292da4..828b31c 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -51,7 +51,8 @@
// Based on set of segment counts calculate a probability tree
static void calc_segtree_probs(unsigned *segcounts,
aom_prob *segment_tree_probs,
- const aom_prob *cur_tree_probs) {
+ const aom_prob *cur_tree_probs,
+ const int probwt) {
// Work out probabilities of each segment
const unsigned cc[4] = { segcounts[0] + segcounts[1],
segcounts[2] + segcounts[3],
@@ -71,8 +72,9 @@
for (i = 0; i < 7; i++) {
const unsigned *ct =
i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2;
- av1_prob_diff_update_savings_search(
- ct, cur_tree_probs[i], &segment_tree_probs[i], DIFF_UPDATE_PROB);
+ av1_prob_diff_update_savings_search(ct, cur_tree_probs[i],
+ &segment_tree_probs[i],
+ DIFF_UPDATE_PROB, probwt);
}
}
@@ -294,6 +296,11 @@
int t_pred_cost = INT_MAX;
int i, tile_col, tile_row, mi_row, mi_col;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
@@ -333,14 +340,15 @@
// Work out probability tree for coding segments without prediction
// and the cost.
- calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs);
+ calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt);
no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
// Key frames cannot use temporal prediction
if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
// Work out probability tree for coding those segments not
// predicted using the temporal method and the cost.
- calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs);
+ calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs,
+ probwt);
t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
// Add in the cost of the signaling for each prediction context.
@@ -349,9 +357,9 @@
const int count1 = temporal_predictor_count[i][1];
t_nopred_prob[i] = get_binary_prob(count0, count1);
- av1_prob_diff_update_savings_search(temporal_predictor_count[i],
- segp->pred_probs[i],
- &t_nopred_prob[i], DIFF_UPDATE_PROB);
+ av1_prob_diff_update_savings_search(
+ temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i],
+ DIFF_UPDATE_PROB, probwt);
// Add in the predictor signaling cost
t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) +
@@ -366,9 +374,6 @@
} else {
seg->temporal_update = 0;
}
-#if CONFIG_DAALA_EC
- av1_tree_to_cdf(av1_segment_tree, segp->tree_probs, segp->tree_cdf);
-#endif
}
void av1_reset_segment_features(AV1_COMMON *cm) {
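calc_segtree_probs() and the temporal-prediction probability search now take a probwt weight (cm->num_tg when tile groups are enabled, 1 otherwise) and forward it to av1_prob_diff_update_savings_search(); the subexp.c hunks further down show how the weight is applied. For reference, a trimmed sketch of the branch counts the function derives for the 8-segment balanced tree, matching the cc pairing visible in the hunk; the ccc pairing is implied by the code but not shown, so it is an assumption here.

    /* Node 0 splits ccc[0]/ccc[1], nodes 1-2 split the cc pairs, nodes 3-6
     * split the raw segment counts; each pair then feeds the weighted
     * savings search. */
    static void segtree_branch_counts_sketch(const unsigned seg[8],
                                             unsigned branch[7][2]) {
      unsigned cc[4], ccc[2];
      int i;
      for (i = 0; i < 4; i++) cc[i] = seg[2 * i] + seg[2 * i + 1];
      for (i = 0; i < 2; i++) ccc[i] = cc[2 * i] + cc[2 * i + 1];
      branch[0][0] = ccc[0]; branch[0][1] = ccc[1];
      branch[1][0] = cc[0];  branch[1][1] = cc[1];
      branch[2][0] = cc[2];  branch[2][1] = cc[3];
      for (i = 0; i < 4; i++) {
        branch[3 + i][0] = seg[2 * i];
        branch[3 + i][1] = seg[2 * i + 1];
      }
    }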
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index acdc13b..2fb651c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -69,6 +69,11 @@
int speed) {
AV1_COMMON *const cm = &cpi->common;
+ // Limit memory usage for high resolutions
+ if (AOMMIN(cm->width, cm->height) > 1080) {
+ sf->use_upsampled_references = 0;
+ }
+
if (speed >= 1) {
if (AOMMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask =
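The new speed-feature rule above turns off up-sampled references whenever either frame dimension exceeds 1080. A back-of-the-envelope sketch of the motivation, under the assumption that an up-sampled reference is stored on the 1/8-pel grid (8x in each dimension) with 4:2:0 chroma; the exact storage layout is an assumption, not taken from this patch.

    /* ~(8w)*(8h)*1.5 bytes per reference under the assumptions above; for
     * 1920x1080 that is roughly 190 MiB per reference frame, hence the cap. */
    static long long upsampled_ref_bytes_sketch(int width, int height) {
      return (long long)(8 * width) * (8 * height) * 3 / 2;
    }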
diff --git a/av1/encoder/subexp.c b/av1/encoder/subexp.c
index 0ca5247..81bb56d 100644
--- a/av1/encoder/subexp.c
+++ b/av1/encoder/subexp.c
@@ -116,7 +116,8 @@
}
int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
- aom_prob *bestp, aom_prob upd) {
+ aom_prob *bestp, aom_prob upd,
+ int probwt) {
const uint32_t old_b = cost_branch256(ct, oldp);
int bestsavings = 0;
aom_prob newp, bestnewp = oldp;
@@ -126,7 +127,7 @@
const uint32_t new_b = cost_branch256(ct, newp);
const uint32_t update_b =
prob_diff_update_cost(newp, oldp) + av1_cost_upd256;
- const int savings = (int)((int64_t)old_b - new_b - update_b);
+ const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
if (savings > bestsavings) {
bestsavings = savings;
bestnewp = newp;
@@ -139,7 +140,7 @@
int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
const aom_prob *oldp,
aom_prob *bestp, aom_prob upd,
- int stepsize) {
+ int stepsize, int probwt) {
int i, old_b, new_b, update_b, savings, bestsavings;
int newp;
const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
@@ -164,7 +165,7 @@
new_b += cost_branch256(ct + 2 * i, newplist[i]);
new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + av1_cost_upd256;
- savings = old_b - new_b - update_b;
+ savings = old_b - new_b - update_b * probwt;
if (savings > bestsavings) {
bestsavings = savings;
bestnewp = newp;
@@ -253,11 +254,11 @@
#endif // CONFIG_ENTROPY
void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
- const unsigned int ct[2]) {
+ const unsigned int ct[2], int probwt) {
const aom_prob upd = DIFF_UPDATE_PROB;
aom_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings =
- av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+ av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
assert(newp >= 1);
if (savings > 0) {
aom_write(w, 1, upd);
@@ -268,12 +269,12 @@
}
}
-int av1_cond_prob_diff_update_savings(aom_prob *oldp,
- const unsigned int ct[2]) {
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+ int probwt) {
const aom_prob upd = DIFF_UPDATE_PROB;
aom_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings =
- av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+ av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
return savings;
}
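Both savings searches in subexp.c now weight the update cost by probwt. The assumption behind probwt = cm->num_tg in the callers is that each tile group repeats the header carrying the probability updates, so an update must save more than probwt copies of its signalling cost before it is worth sending. Sketch of the weighted decision, in the same 1/256-bit cost units the code uses:

    #include <stdint.h>

    /* A positive return value means sending the update pays for itself. */
    static int update_savings_sketch(uint32_t old_bits, uint32_t new_bits,
                                     uint32_t update_bits, int probwt) {
      return (int)((int64_t)old_bits - new_bits -
                   (int64_t)update_bits * probwt);
    }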
diff --git a/av1/encoder/subexp.h b/av1/encoder/subexp.h
index 25750bb..d01dea9 100644
--- a/av1/encoder/subexp.h
+++ b/av1/encoder/subexp.h
@@ -19,20 +19,22 @@
#include "aom_dsp/bitwriter.h"
#include "aom_dsp/prob.h"
-void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp);
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm);
void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
- const unsigned int ct[2]);
+ const unsigned int ct[2], int probwt);
int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
- aom_prob *bestp, aom_prob upd);
+ aom_prob *bestp, aom_prob upd,
+ int probwt);
int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
const aom_prob *oldp,
aom_prob *bestp, aom_prob upd,
- int stepsize);
-int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2]);
+ int stepsize, int probwt);
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+ int probwt);
#if CONFIG_ENTROPY
int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
aom_prob *bestp, aom_prob upd, int n);
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 47cc02a..d79763e 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -357,17 +357,17 @@
}
static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
-#if CONFIG_RANS || CONFIG_DAALA_EC
- const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
-#endif // CONFIG_RANS
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
+#endif // CONFIG_EC_MULTISYMBOL
int32_t extra, uint8_t token,
uint8_t skip_eob_node, unsigned int *counts) {
(*t)->token = token;
(*t)->extra = extra;
(*t)->context_tree = context_tree;
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
(*t)->token_cdf = token_cdf;
-#endif // CONFIG_RANS
+#endif // CONFIG_EC_MULTISYMBOL
(*t)->skip_eob_node = skip_eob_node;
(*t)++;
++counts[token];
@@ -458,7 +458,7 @@
aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
#endif // CONFIG_ENTROPY
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
cpi->common.fc->coef_cdfs[tx_size][type][ref];
#endif
@@ -483,8 +483,8 @@
av1_get_token_extra(v, &token, &extra);
add_token(&t, coef_probs[band[c]][pt],
-#if CONFIG_RANS || CONFIG_DAALA_EC
- (const aom_cdf_prob(*)[ENTROPY_TOKENS]) & coef_cdfs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+ &coef_cdfs[band[c]][pt],
#endif
extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
@@ -495,7 +495,7 @@
}
if (c < seg_eob) {
add_token(&t, coef_probs[band[c]][pt],
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
NULL,
#endif
0, EOB_TOKEN, 0, counts[band[c]][pt]);
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index b9487da..27bdef5 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -36,8 +36,8 @@
typedef struct {
const aom_prob *context_tree;
-#if CONFIG_RANS || CONFIG_DAALA_EC
- const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
+#if CONFIG_EC_MULTISYMBOL
+ aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
#endif
EXTRABIT extra;
uint8_t token;
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index f4bd142..77ae724 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,14 +18,6 @@
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/txfm_common_avx2.h"
-static INLINE void mm256_reverse_epi16(__m256i *u) {
- const __m256i control = _mm256_set_epi16(
- 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
- 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
- __m256i v = _mm256_shuffle_epi8(*u, control);
- *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
static int32_t get_16x16_sum(const int16_t *input, int stride) {
__m256i r0, r1, r2, r3, u0, u1;
__m256i zero = _mm256_setzero_si256();
@@ -71,134 +63,6 @@
_mm256_zeroupper();
}
-static void mm256_transpose_16x16(__m256i *in) {
- __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
- __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
- __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
- __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
- __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
- __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
- __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
- __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
- __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
- __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
- __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
- __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
- __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
- __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
- __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
- __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
- // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
- // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
- // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
- // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
- // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
- // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
- // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
- // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
-
- // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
- // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
- // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
- // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
- // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
- // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
- // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
- // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
-
- __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
- __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
- __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
- __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
- __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
- __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
- __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
- __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
- __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
- __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
- __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
- __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
- __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
- __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
- __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
- __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
- // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
- // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
- // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
- // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
- // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
- // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
- // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
- // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
-
- // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
- // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
- // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
- // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf
- // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
- // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
- // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd
- // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
-
- tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
- tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
- tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
- tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
- tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
- tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
- tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
- tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
- tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
- tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
- tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
- tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
- tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
- tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
- tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
- tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
- // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
- // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
- // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
- // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
- // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
- // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
- // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
- // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
-
- // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
- // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
- // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
- // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
- // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc
- // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
- // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe
- // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
-
- in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
- in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
- in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
- in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
- in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
- in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
- in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
- in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
- in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
- in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
- in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
- in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
- in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
- in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
- in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
- in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-
static INLINE void load_buffer_16x16(const int16_t *input, int stride,
int flipud, int fliplr, __m256i *in) {
if (!flipud) {
@@ -352,19 +216,6 @@
in[15] = _mm256_srai_epi16(in[15], 2);
}
-static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
- const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i y0 = _mm256_madd_epi16(a0, cospi);
- __m256i y1 = _mm256_madd_epi16(a1, cospi);
-
- y0 = _mm256_add_epi32(y0, dct_rounding);
- y1 = _mm256_add_epi32(y1, dct_rounding);
- y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
- y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
- return _mm256_packs_epi32(y0, y1);
-}
-
static void fdct16_avx2(__m256i *in) {
// sequence: cospi_L_H = pairs(L, H) and L first
const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
@@ -1099,31 +950,7 @@
}
#if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) {
- const __m256i zero = _mm256_setzero_si256();
- const __m256i sqrt2_epi16 = _mm256_set1_epi16((int16_t)Sqrt2);
- const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- __m256i u0, u1;
- int i = 0;
-
- while (i < 16) {
- in[i] = _mm256_slli_epi16(in[i], 1);
-
- u0 = _mm256_unpacklo_epi16(zero, in[i]);
- u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
- u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
- u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
- u0 = _mm256_add_epi32(u0, dct_const_rounding);
- u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
- u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
- u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
- in[i] = _mm256_packs_epi32(u0, u1);
- i++;
- }
-}
+static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
#endif
void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
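fidtx16_avx2() collapses into a call to txfm_scaling16_avx2(Sqrt2, in), presumably a helper shared through txfm_common_avx2.h alongside the other routines deleted above (mm256_reverse_epi16, mm256_transpose_16x16, butter_fly). For reference, the scalar equivalent of what the removed body computed per lane, with the constant values written out as assumptions (Sqrt2 taken as sqrt(2) in Q12, DCT_CONST_BITS as 14):

    #include <stdint.h>

    #define SQRT2_Q12_SKETCH 5793       /* assumed value of Sqrt2 */
    #define DCT_CONST_BITS_SKETCH 14    /* assumed value of DCT_CONST_BITS */

    /* out = ROUND_POWER_OF_TWO(2 * x * Sqrt2, DCT_CONST_BITS): the identity
     * transform stage with the codebase's fixed-point scaling and rounding. */
    static int16_t fidtx16_scale_sketch(int16_t x) {
      int32_t y = (int32_t)(2 * x) * SQRT2_Q12_SKETCH;
      y += 1 << (DCT_CONST_BITS_SKETCH - 1);
      return (int16_t)(y >> DCT_CONST_BITS_SKETCH);
    }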
diff --git a/configure b/configure
index 5bfce04..3258ee3 100755
--- a/configure
+++ b/configure
@@ -272,6 +272,7 @@
supertx
ans
rans
+ ec_multisymbol
loop_restoration
ext_partition
ext_partition_types
@@ -289,6 +290,10 @@
delta_q
adapt_scan
filter_7bit
+ parallel_deblocking
+ tile_groups
+ ec_adapt
+ simp_mv_pred
"
CONFIG_LIST="
dependency_tracking
@@ -405,6 +410,7 @@
aom_highbitdepth
experimental
aom_qm
+ tile_groups
"
process_cmdline() {
@@ -450,6 +456,11 @@
enabled ${c} && enable_feature ${c##*_}s
done
+ # Fix up experiment dependencies
+ enabled ec_adapt && enable_feature ec_multisymbol
+ enabled ec_multisymbol && ! enabled ans && soft_enable daala_ec
+ enabled ec_multisymbol && ! enabled daala_ec && soft_enable ans
+ enabled daala_ec && enable_feature ec_multisymbol
if enabled global_motion && (enabled ext_inter || enabled dual_filter); then
log_echo "global_motion currently not compatible with ext_inter"
log_echo "and dual_filter. Disabling global_motion."
@@ -619,6 +630,7 @@
check_add_cflags -Wuninitialized
check_add_cflags -Wunused
check_add_cflags -Wsign-compare
+ check_add_cflags -Wlogical-op
# Enabling the following warning (in combination with -Wunused above)
# for C++ generates errors in third_party code including googletest and
# libyuv. So enable it only for C code.
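The dependency fixup added to configure reads: ec_adapt pulls in ec_multisymbol; ec_multisymbol soft-enables a coder backend (daala_ec unless ans was requested, otherwise ans); and daala_ec itself implies ec_multisymbol. The C sketch below merely restates that resolution order with booleans for readability; the struct and function are hypothetical, and the shell logic above is what actually runs.

#include <stdbool.h>

/* Hypothetical mirror of the experiment-dependency fixup in configure.
 * The evaluation order matches the shell lines above. */
struct ec_flags {
  bool ec_adapt, ec_multisymbol, daala_ec, ans;
};

static void resolve_ec_flags(struct ec_flags *f) {
  if (f->ec_adapt) f->ec_multisymbol = true;
  if (f->ec_multisymbol && !f->ans) f->daala_ec = true; /* soft_enable daala_ec */
  if (f->ec_multisymbol && !f->daala_ec) f->ans = true; /* soft_enable ans */
  if (f->daala_ec) f->ec_multisymbol = true;
}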
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 435e106..0324b8e 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -15,7 +15,7 @@
#include "./aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "av1/common/filter.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 4a44e16..0b89071 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -33,6 +33,11 @@
av1_fht16x16_c(in, out, stride, tx_type);
}
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+ int tx_type) {
+ av1_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
#if CONFIG_AOM_HIGHBITDEPTH
typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type, int bd);
@@ -48,16 +53,6 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if HAVE_AVX2
-void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
- int tx_type) {
- (void)in;
- (void)out;
- (void)stride;
- (void)tx_type;
-}
-#endif
-
class AV1Trans16x16HT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Ht16x16Param> {
public:
@@ -70,6 +65,7 @@
pitch_ = 16;
height_ = 16;
fwd_txfm_ref = fht16x16_ref;
+ inv_txfm_ref = iht16x16_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
num_coeffs_ = GET_PARAM(4);
@@ -90,6 +86,7 @@
};
TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
#if CONFIG_AOM_HIGHBITDEPTH
class AV1HighbdTrans16x16HT
@@ -203,22 +200,27 @@
#if HAVE_AVX2
const Ht16x16Param kArrayHt16x16Param_avx2[] = {
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256),
#if CONFIG_EXT_TX
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256),
- make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256)
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256)
#endif // CONFIG_EXT_TX
};
INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
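With dummy_inv_txfm gone, each AVX2 parameter tuple pairs av1_fht16x16_avx2 with av1_iht16x16_256_add_avx2, and the new InvCoeffCheck compares the SIMD inverse against the C reference iht16x16_ref. The sketch below shows the general shape of such a consistency check under assumed names and types; it is not the TransformTestBase implementation.

#include <stdint.h>
#include <string.h>

typedef void (*InvTxfm16x16Func)(const int32_t *coeff, uint8_t *dst,
                                 int stride, int tx_type);

/* Returns nonzero when the tested and reference inverse transforms produce
 * bit-exact reconstructions from the same coefficients and prediction. */
static int inv_txfm_matches(InvTxfm16x16Func test_fn, InvTxfm16x16Func ref_fn,
                            const int32_t *coeff, int tx_type) {
  uint8_t dst_test[16 * 16], dst_ref[16 * 16];
  memset(dst_test, 128, sizeof(dst_test)); /* identical flat prediction */
  memset(dst_ref, 128, sizeof(dst_ref));
  test_fn(coeff, dst_test, 16, tx_type);
  ref_fn(coeff, dst_ref, 16, tx_type);
  return memcmp(dst_test, dst_ref, sizeof(dst_test)) == 0;
}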
diff --git a/test/av1_inv_txfm_test.cc b/test/av1_inv_txfm_test.cc
index 84e2402..8f6c868 100644
--- a/test/av1_inv_txfm_test.cc
+++ b/test/av1_inv_txfm_test.cc
@@ -24,7 +24,7 @@
#include "av1/common/blockd.h"
#include "av1/common/scan.h"
#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
+#include "aom_dsp/inv_txfm.h"
using libaom_test::ACMRandom;
@@ -104,10 +104,10 @@
INSTANTIATE_TEST_CASE_P(
C, AV1InvTxfm,
- ::testing::Values(IdctParam(&av1_idct4_c, &reference_idct_1d, 4, 1),
- IdctParam(&av1_idct8_c, &reference_idct_1d, 8, 2),
- IdctParam(&av1_idct16_c, &reference_idct_1d, 16, 4),
- IdctParam(&av1_idct32_c, &reference_idct_1d, 32, 6)));
+ ::testing::Values(IdctParam(&aom_idct4_c, &reference_idct_1d, 4, 1),
+ IdctParam(&aom_idct8_c, &reference_idct_1d, 8, 2),
+ IdctParam(&aom_idct16_c, &reference_idct_1d, 16, 4),
+ IdctParam(&aom_idct32_c, &reference_idct_1d, 32, 6)));
#if CONFIG_AV1_ENCODER
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
@@ -262,19 +262,19 @@
INSTANTIATE_TEST_CASE_P(
C, AV1PartialIDctTest,
- ::testing::Values(make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_34_add_c, TX_32X32, 34),
- make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_1_add_c, TX_32X32, 1),
- make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_10_add_c, TX_16X16, 10),
- make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_1_add_c, TX_16X16, 1),
- make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_12_add_c, TX_8X8, 12),
- make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_1_add_c, TX_8X8, 1),
- make_tuple(&av1_fdct4x4_c, &av1_idct4x4_16_add_c,
- &av1_idct4x4_1_add_c, TX_4X4, 1)));
+ ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_34_add_c, TX_32X32, 34),
+ make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_1_add_c, TX_32X32, 1),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_10_add_c, TX_16X16, 10),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_1_add_c, TX_16X16, 1),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_12_add_c, TX_8X8, 12),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_1_add_c, TX_8X8, 1),
+ make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
+ &aom_idct4x4_1_add_c, TX_4X4, 1)));
#endif // CONFIG_AV1_ENCODER
} // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index e4179ef..cb2fbd5 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -436,6 +436,15 @@
&aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
+ &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+ make_tuple(&aom_fdct32x32_rd_avx2,
+ &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+#endif // HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, Trans32x32Test,
diff --git a/test/encoder_parms_get_to_decoder.cc b/test/encoder_parms_get_to_decoder.cc
index 52d68b1..e6bf0be 100644
--- a/test/encoder_parms_get_to_decoder.cc
+++ b/test/encoder_parms_get_to_decoder.cc
@@ -49,7 +49,9 @@
{ 0, 0, 0, 1, 0, AOM_CR_STUDIO_RANGE, AOM_CS_BT_601, { 0, 0 } },
{ 0, 0, 0, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_709, { 0, 0 } },
{ 0, 0, 1, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_2020, { 0, 0 } },
+#if !CONFIG_EC_ADAPT
{ 0, 2, 0, 0, 1, AOM_CR_STUDIO_RANGE, AOM_CS_UNKNOWN, { 640, 480 } },
+#endif
// TODO(JBB): Test profiles (requires more work).
};
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 6b2f1ea..2685f87 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -89,6 +89,7 @@
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
encoder->Control(AOME_SET_ARNR_TYPE, 3);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
} else {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -172,6 +173,10 @@
TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { DoTest(); }
+#if CONFIG_EC_ADAPT
+// TODO(thdavies): EC_ADAPT does not support tiles
+
+#else
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
@@ -180,5 +185,6 @@
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
- ::testing::Range(1, 3));
+ ::testing::Range(0, 3));
+#endif
} // namespace
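The extra control in the threaded-encoder setup turns frame-parallel decoding mode off so backward probability adaptation stays active. Outside the test harness the same knob is set through aom_codec_control(); the helper below is a hypothetical wrapper that assumes an encoder context already initialized with aom_codec_enc_init().

#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

/* Hypothetical helper: disable frame-parallel decoding mode on an
 * already-initialized encoder context. */
static aom_codec_err_t disable_frame_parallel_decoding(aom_codec_ctx_t *ctx) {
  return aom_codec_control(ctx, AV1E_SET_FRAME_PARALLEL_DECODING, 0);
}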
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 9f62ffe..bbfb7f1 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -728,8 +728,7 @@
make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12)));
#endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \
- !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
&aom_idct8x8_64_add_ssse3,
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 1f85761..8545b2c 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -90,12 +90,11 @@
IhtFunc inv_txfm_;
};
-// TODO(luoyi): Owing to the range check in DCT_DCT of av1_fht32x32_avx2, as
-// input is out of the range, we use aom_fdct32x32_avx2. However this function
-// does not support CONFIG_AOM_HIGHBITDEPTH. I need to fix the scaling/rounding
-// of av1_fht32x32_avx2 then add this test on CONFIG_AOM_HIGHBITDEPTH.
-#if !CONFIG_AOM_HIGHBITDEPTH
TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+// TODO(luoyi): When CONFIG_AOM_HIGHBITDEPTH = 1, the AVX2 implementation of
+// av1_fht32x32 does not yet support tran_low_t (int32_t) intermediate
+// results, so the MemCheck test cannot pass for tx_type = 1, 2, ..., 8.
+#if !CONFIG_AOM_HIGHBITDEPTH
TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
#endif
diff --git a/test/scan_test.cc b/test/scan_test.cc
new file mode 100644
index 0000000..22a6d85
--- /dev/null
+++ b/test/scan_test.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/scan.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+TEST(ScanTest, av1_augment_prob) {
+ const int tx1d_size = 4;
+ uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+ const uint32_t ref_prob[16] = {
+ 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2
+ };
+ av1_augment_prob(prob, tx1d_size, tx1d_size);
+ for (int r = 0; r < tx1d_size; ++r) {
+ for (int c = 0; c < tx1d_size; ++c) {
+ const int idx = r * tx1d_size + c;
+ EXPECT_EQ(ref_prob[idx], prob[idx] >> 16);
+ }
+ }
+
+ const int mask = (1 << 10) - 1;
+ for (int r = 0; r < tx1d_size; ++r) {
+ for (int c = 0; c < tx1d_size; ++c) {
+ const int idx = r * tx1d_size + c;
+ EXPECT_EQ(idx, mask ^ (prob[r * tx1d_size + c] & mask));
+ }
+ }
+}
+
+TEST(ScanTest, av1_update_sort_order) {
+ const TX_SIZE tx_size = TX_4X4;
+ const uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+ const int16_t ref_sort_order[16] = { 0, 1, 4, 5, 2, 3, 6, 8,
+ 9, 12, 7, 10, 13, 11, 14, 15 };
+ int16_t sort_order[16];
+ av1_update_sort_order(tx_size, prob, sort_order);
+ for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
+}
+
+TEST(ScanTest, av1_update_scan_order) {
+ TX_SIZE tx_size = TX_4X4;
+ const uint32_t prob[16] = { 4, 5, 7, 4, 5, 6, 8, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+ int16_t sort_order[16];
+ int16_t scan[16];
+ int16_t iscan[16];
+ const int16_t ref_iscan[16] = { 0, 1, 2, 6, 3, 4, 5, 10,
+ 7, 8, 11, 13, 9, 12, 14, 15 };
+
+ av1_update_sort_order(tx_size, prob, sort_order);
+ av1_update_scan_order(tx_size, sort_order, scan, iscan);
+
+ for (int i = 0; i < 16; ++i) {
+ EXPECT_EQ(ref_iscan[i], iscan[i]);
+ EXPECT_EQ(i, scan[ref_iscan[i]]);
+ }
+}
+
+TEST(ScanTest, av1_update_neighbors) {
+ TX_SIZE tx_size = TX_4X4;
+ // raster order
+ const int16_t scan[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+ int16_t nb[(16 + 1) * 2];
+ const int16_t ref_nb[(16 + 1) * 2] = { 0, 0, 0, 0, 1, 1, 2, 2, 0,
+ 0, 4, 1, 5, 2, 6, 3, 4, 4,
+ 8, 5, 9, 6, 10, 7, 8, 8, 12,
+ 9, 13, 10, 14, 11, 0, 0 };
+
+ // raster order's scan and iscan are the same
+ av1_update_neighbors(tx_size, scan, scan, nb);
+ for (int i = 0; i < (16 + 1) * 2; ++i) {
+ EXPECT_EQ(ref_nb[i], nb[i]);
+ }
+}
+
+} // namespace
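The first test pins down the packing convention av1_augment_prob is expected to produce: the original count survives in the high 16 bits and the low 10 bits carry the bit-inverted raster index, so a single descending sort orders positions by probability and breaks ties toward earlier scan positions. The helper below is only an illustration of that convention, matching the two EXPECT_EQ checks above; it is not the library code.

#include <stdint.h>

/* Pack a coefficient count and its raster index the way the test expects:
 * (packed >> 16) recovers the count and (packed & 0x3ff) ^ 0x3ff recovers the
 * index.  Larger packed values sort first; for equal counts the smaller
 * raster index wins because its inverted low bits are larger. */
static uint32_t pack_augmented_prob(uint16_t count, int raster_idx) {
  const uint32_t mask = (1u << 10) - 1;
  return ((uint32_t)count << 16) | (mask ^ (uint32_t)raster_idx);
}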
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 339843f..94f4be9 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -134,7 +134,7 @@
::testing::ValuesIn(tile_col_values),
::testing::ValuesIn(tile_row_values)));
#else
-#if !CONFIG_ANS
+#if !CONFIG_ANS && !CONFIG_DAALA_EC
AV1_INSTANTIATE_TEST_CASE(
SuperframeTest,
::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index cbc27d0..57f4a60 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -126,6 +126,9 @@
TEST_P(TileIndependenceTestLarge, MD5Match) { DoTest(); }
+#if CONFIG_EC_ADAPT
+// TODO(thdavies): EC_ADAPT does not support tiles
+#else
#if CONFIG_EXT_TILE
AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(1, 2, 32),
::testing::Values(1, 2, 32));
@@ -138,4 +141,5 @@
AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
::testing::Values(0, 1));
#endif // CONFIG_EXT_TILE
+#endif // CONFIG_EC_ADAPT
} // namespace
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 540136c..64bf2d6 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -210,7 +210,7 @@
int out_idx = j * stride + k;
ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
<< "Error: not bit-exact result at index: " << out_idx
- << " at test block: " << i;
+ << " j = " << j << " k = " << k << " at test block: " << i;
}
}
}