Merge "Removing is_intra_mode() function."
diff --git a/build/make/Makefile b/build/make/Makefile
index 7a25239..030c1b5 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -114,6 +114,10 @@
$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c6c8660..83f480a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1108,6 +1108,18 @@
soft_enable sse4_1
fi
+ if enabled gcc && ! disabled avx && ! check_cflags -mavx; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx "
+ else
+ soft_enable avx
+ fi
+
+ if enabled gcc && ! disabled avx2 && ! check_cflags -mavx2; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx2 "
+ else
+ soft_enable avx2
+ fi
+
case "${AS}" in
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index c531e95..2967b5a 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -327,11 +327,11 @@
require c
case $arch in
x86)
- ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+ ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
x86
;;
x86_64)
- ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1)
+ ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
REQUIRES=${REQUIRES:-mmx sse sse2}
require $(filter $REQUIRES)
x86
diff --git a/configure b/configure
index 297cec4..621161c 100755
--- a/configure
+++ b/configure
@@ -234,6 +234,8 @@
sse3
ssse3
sse4_1
+ avx
+ avx2
altivec
"
@@ -422,7 +424,7 @@
fi
# The write_common_config (config.mk) logic is deferred until after the
- # recursive calls to configure complete, becuase we want our universal
+ # recursive calls to configure complete, because we want our universal
# targets to be executed last.
write_common_config_targets
enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
@@ -608,7 +610,12 @@
check_add_cflags -Wuninitialized
check_add_cflags -Wunused-variable
case ${CC} in
- *clang*) ;;
+ *clang*)
+ # libvpx and/or clang have issues with aliasing:
+ # https://code.google.com/p/webm/issues/detail?id=603
+ # work around them until they are fixed
+ check_add_cflags -fno-strict-aliasing
+ ;;
*) check_add_cflags -Wunused-but-set-variable ;;
esac
enabled extra_warnings || check_add_cflags -Wno-unused-function
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 6d50644..85f4bb6 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -176,14 +176,32 @@
}
}
-class DatarateTestVP9 : public DatarateTest {
+class DatarateTestVP9 : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ public:
+ DatarateTestVP9() : EncoderTest(GET_PARAM(0)) {}
+
protected:
virtual ~DatarateTestVP9() {}
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ ResetModel();
+ }
+
+ virtual void ResetModel() {
+ last_pts_ = 0;
+ frame_number_ = 0;
+ bits_total_ = 0;
+ duration_ = 0.0;
+ }
+
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
if (video->frame() == 1) {
- encoder->Control(VP8E_SET_CPUUSED, 2);
+ encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
}
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -205,6 +223,14 @@
effective_datarate_ = ((bits_total_) / 1000.0) / duration_;
}
}
+
+ vpx_codec_pts_t last_pts_;
+ double timebase_;
+ int frame_number_;
+ int64_t bits_total_;
+ double duration_;
+ double effective_datarate_;
+ int set_cpu_used_;
};
// There is no buffer model/frame dropper in VP9 currently, so for now we
@@ -218,7 +244,7 @@
::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
- for (int i = 200; i < 800; i += 200) {
+ for (int i = 150; i < 800; i += 200) {
cfg_.rc_target_bitrate = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
@@ -231,5 +257,6 @@
VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);
VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9,
- ::testing::Values(::libvpx_test::kOnePassGood));
+ ::testing::Values(::libvpx_test::kOnePassGood),
+ ::testing::Range(1, 5));
} // namespace
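Note: ::testing::Range(1, 5) is half-open, so the instantiation above runs DatarateTestVP9 at cpu-used speeds 1 through 4. A self-contained sketch of that Range semantics (RangeDemo is hypothetical, not part of the patch):

    #include "third_party/googletest/src/include/gtest/gtest.h"

    class RangeDemo : public ::testing::TestWithParam<int> {};

    // Each instantiated case receives one value from [1, 5).
    TEST_P(RangeDemo, YieldsOneThroughFour) {
      EXPECT_GE(GetParam(), 1);
      EXPECT_LT(GetParam(), 5);
    }

    INSTANTIATE_TEST_CASE_P(HalfOpen, RangeDemo, ::testing::Range(1, 5));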
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index a4dbca4..80aca98 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -45,6 +45,10 @@
append_gtest_filter(":-SSSE3/*");
if (!(simd_caps & HAS_SSE4_1))
append_gtest_filter(":-SSE4_1/*");
+ if (!(simd_caps & HAS_AVX))
+ append_gtest_filter(":-AVX/*");
+ if (!(simd_caps & HAS_AVX2))
+ append_gtest_filter(":-AVX2/*");
#endif
#if !CONFIG_SHARED
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 76fc9bb..a8ce6e4 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -8,16 +8,19 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/decoder/vp9_thread.h"
+#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/md5_helper.h"
#include "test/webm_video_source.h"
+#include "vp9/decoder/vp9_thread.h"
namespace {
+using std::string;
+
class VP9WorkerThreadTest : public ::testing::TestWithParam<bool> {
protected:
virtual ~VP9WorkerThreadTest() {}
@@ -91,19 +94,26 @@
EXPECT_FALSE(worker_.had_error);
}
-TEST(VP9DecodeMTTest, MTDecode) {
- libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
+// -----------------------------------------------------------------------------
+// Multi-threaded decode tests
+
+// Decodes |filename| with |num_threads|. Returns the MD5 of the decoded frames.
+string DecodeFile(const string& filename, int num_threads) {
+ libvpx_test::WebMVideoSource video(filename);
video.Init();
vpx_codec_dec_cfg_t cfg = {0};
- cfg.threads = 2;
+ cfg.threads = num_threads;
libvpx_test::VP9Decoder decoder(cfg, 0);
libvpx_test::MD5 md5;
for (video.Begin(); video.cxdata(); video.Next()) {
const vpx_codec_err_t res =
decoder.DecodeFrame(video.cxdata(), video.frame_size());
- ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+ if (res != VPX_CODEC_OK) {
+ EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+ break;
+ }
libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
const vpx_image_t *img = NULL;
@@ -113,7 +123,32 @@
md5.Add(img);
}
}
- EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
+ return string(md5.Get());
+}
+
+TEST(VP9DecodeMTTest, MTDecode) {
+  // No tiles or frame-parallel decoding; this exercises loop filter threading.
+ EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+ DecodeFile("vp90-2-03-size-226x226.webm", 2).c_str());
+}
+
+TEST(VP9DecodeMTTest, MTDecode2) {
+ static const struct {
+ const char *name;
+ const char *expected_md5;
+ } files[] = {
+ { "vp90-2-08-tile_1x2_frame_parallel.webm",
+ "68ede6abd66bae0a2edf2eb9232241b6" },
+ { "vp90-2-08-tile_1x4_frame_parallel.webm",
+ "368ebc6ebf3a5e478d85b2c3149b2848" },
+ };
+
+ for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
+ for (int t = 2; t <= 4; ++t) {
+ EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
+ << "threads = " << t;
+ }
+ }
}
INSTANTIATE_TEST_CASE_P(Synchronous, VP9WorkerThreadTest, ::testing::Bool());
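Note: DecodeFile() returns a string, and gtest's ASSERT_* macros expand to a bare return, so they only compile in functions returning void; that is why the decode loop above switches to EXPECT_EQ plus an early break. A self-contained sketch of the pattern (CheckedDecode is hypothetical):

    #include <string>
    #include "third_party/googletest/src/include/gtest/gtest.h"

    // EXPECT_* records a non-fatal failure and lets a value-returning
    // helper bail out; ASSERT_* would not compile here.
    static std::string CheckedDecode(int status) {
      if (status != 0) {
        EXPECT_EQ(0, status) << "decode failed";
        return std::string();  // stop decoding, as DecodeFile() does
      }
      return "ok";
    }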
diff --git a/vp8/common/idct_blk.c b/vp8/common/idct_blk.c
index 8edfffb..65d5002 100644
--- a/vp8/common/idct_blk.c
+++ b/vp8/common/idct_blk.c
@@ -10,6 +10,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
void vp8_dequant_idct_add_c(short *input, short *dq,
unsigned char *dest, int stride);
@@ -32,7 +33,7 @@
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -58,7 +59,7 @@
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -77,7 +78,7 @@
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
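Note: the ((int *)q)[0] = 0 stores removed here (and in the files below) wrote a 32-bit int through a short *, which violates C's strict-aliasing rules; it is the same class of issue the clang -fno-strict-aliasing workaround in configure (webm issue 603) papers over. A minimal sketch of the well-defined replacement, assuming 16-bit coefficients:

    #include <string.h>

    /* Zero the first two 16-bit coefficients without an aliasing
     * violation; mirrors the vpx_memset(q, 0, 2 * sizeof(q[0])) calls. */
    static void zero_dc_pair(short *q) {
      memset(q, 0, 2 * sizeof(q[0]));  /* clears q[0] and q[1] */
    }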
diff --git a/vp8/common/x86/idct_blk_mmx.c b/vp8/common/x86/idct_blk_mmx.c
index 49b2013..a1e4ce6 100644
--- a/vp8/common/x86/idct_blk_mmx.c
+++ b/vp8/common/x86/idct_blk_mmx.c
@@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
@@ -35,7 +36,7 @@
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -44,7 +45,7 @@
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
dst+4, stride);
- ((int *)(q+16))[0] = 0;
+ vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
}
if (eobs[2] > 1)
@@ -53,7 +54,7 @@
{
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
dst+8, stride);
- ((int *)(q+32))[0] = 0;
+ vpx_memset(q + 32, 0, 2 * sizeof(q[0]));
}
if (eobs[3] > 1)
@@ -62,7 +63,7 @@
{
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
dst+12, stride);
- ((int *)(q+48))[0] = 0;
+ vpx_memset(q + 48, 0, 2 * sizeof(q[0]));
}
q += 64;
@@ -84,7 +85,7 @@
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -93,7 +94,7 @@
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
- ((int *)(q+16))[0] = 0;
+ vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
@@ -108,7 +109,7 @@
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
- ((int *)q)[0] = 0;
+ vpx_memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -117,7 +118,7 @@
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
- ((int *)(q+16))[0] = 0;
+ vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 759d842..35a22c7 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -512,15 +512,15 @@
else
{
mbmi->mode = NEARMV;
- vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+ vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
}
}
else
{
mbmi->mode = NEARESTMV;
- vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+ vp8_clamp_mv2(&mbmi->mv, &pbi->mb);
}
}
else
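Note: the reordering above appears intended to keep the shared near_mvs[] candidate array unclamped; previously vp8_clamp_mv2() mutated the candidate in place before it was copied. Sketch of the corrected order (names from the diff):

    /* Copy the unclamped candidate first, then clamp only the
     * macroblock's private copy. */
    mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
    vp8_clamp_mv2(&mbmi->mv, &pbi->mb);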
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 50ee9c5..16da78a 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -211,7 +211,7 @@
(b->qcoeff[0] * DQC[0],
dst, dst_stride,
dst, dst_stride);
- ((int *)b->qcoeff)[0] = 0;
+ vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -248,21 +248,14 @@
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
+ vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
+ vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index 7303189..fe290cf 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -227,7 +227,7 @@
{
vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
dst, dst_stride, dst, dst_stride);
- ((int *)b->qcoeff)[0] = 0;
+ vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -264,21 +264,14 @@
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
+ vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- ((int *)b->qcoeff)[0] = 0;
+ vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3347b35..21c91d6 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -309,8 +309,8 @@
192, 128, 64
};
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
@@ -416,7 +416,7 @@
fc->partition_prob[INTER_FRAME][i], 0);
if (cm->mcomp_filter_type == SWITCHABLE) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
counts->switchable_interp[i],
pre_fc->switchable_interp_prob[i],
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ab37b75..ea96555 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -16,6 +16,7 @@
#define TX_SIZE_CONTEXTS 2
#define SWITCHABLE_FILTERS 3 // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
// #define MODE_STATS
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 85ac6d2..218e12e 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -16,12 +16,6 @@
#include "vp9/common/vp9_seg_common.h"
-struct loop_filter_info {
- const uint8_t *mblim;
- const uint8_t *lim;
- const uint8_t *hev_thr;
-};
-
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+ vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
@@ -268,7 +262,7 @@
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
- int i;
+ int lvl;
// init limits for given sharpness
update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@
lf_init_lut(lfi);
// init hev threshold const vectors
- for (i = 0; i < 4; i++)
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -330,16 +324,14 @@
static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
- struct loop_filter_info *lfi) {
+ const loop_filter_thresh **lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
if (filter_level > 0) {
- lfi->mblim = lfi_n->mblim[filter_level];
- lfi->lim = lfi_n->lim[filter_level];
- lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+ *lfi = &lfi_n->lfthr[filter_level];
return 1;
} else {
return 0;
@@ -351,11 +343,13 @@
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
if (mask & 1) {
if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
s += 8;
- lfi++;
+ p_lfi++;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
@@ -393,12 +387,14 @@
unsigned int mask_4x4,
unsigned int mask_4x4_int,
int only_4x4_1,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
count = 1;
if (mask & 1) {
if (!only_4x4_1) {
@@ -432,7 +428,7 @@
lfi->lim, lfi->hev_thr, 1);
}
s += 8 * count;
- lfi += count;
+ p_lfi += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -805,7 +801,7 @@
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+ if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
continue;
// Build masks based on the transform size of each block
@@ -925,7 +921,7 @@
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
int row_shift = 3 - ss_x;
int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
- continue;
+
+ build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
}
if (!plane->plane_type) {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index c698090..62389ea 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -46,12 +46,13 @@
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- hev_thr[4][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
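Note: regrouping the three per-level vectors into one loop_filter_thresh record lets a single pointer fetch replace three parallel table lookups, and hev_thr is now stored per filter level (filled with lvl >> 4) instead of in a 4-entry table. Lookup sketch, using names from the diff:

    /* One aligned record per filter level; mblim, lim and hev_thr travel
     * together, so build_lfi() can hand out a single pointer. */
    static const loop_filter_thresh *
    get_thresh(const loop_filter_info_n *lfi, int filter_level) {
      return &lfi->lfthr[filter_level];
    }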
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a823de8..ba2e9d8 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -43,7 +43,7 @@
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+ vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -62,7 +62,7 @@
vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
- unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index a869dc0..19032bf 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -127,14 +127,14 @@
return get_tx_probs(bsize, context, tx_probs);
}
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
- TX_SIZE tx_size, struct tx_counts *tx_counts) {
- if (bsize >= BLOCK_32X32)
- tx_counts->p32x32[context][tx_size]++;
- else if (bsize >= BLOCK_16X16)
- tx_counts->p16x16[context][tx_size]++;
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+ struct tx_counts *tx_counts) {
+ if (bsize < BLOCK_16X16)
+ return tx_counts->p8x8[context];
+ else if (bsize < BLOCK_32X32)
+ return tx_counts->p16x16[context];
else
- tx_counts->p8x8[context][tx_size]++;
+ return tx_counts->p32x32[context];
}
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
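Note: replacing update_tx_counts() with a getter lets callers increment the histogram in place, which is exactly what the vp9_decodemv.c hunk below now does:

    /* Select the per-context histogram row, then bump one bin. */
    ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];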
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 53b9003..1c96788 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -161,13 +161,12 @@
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
- const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
+  // The resulting mv_q4 has precision MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
uint8_t *pre;
- // mv_precision precision is MV_PRECISION_Q4.
- const MV mv_q4 = {res_mv.row, res_mv.col };
MV32 scaled_mv;
int xs, ys;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2d9fbff..3f3268f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -22,10 +22,11 @@
# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
- sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+ sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
# this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 &&
+ ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
#
# RECON
@@ -199,7 +200,7 @@
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -671,10 +672,10 @@
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block $sse2_x86inc
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
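
Note: the long run of and/andnot/or triples above is one branchless select
idiom applied per pixel row: output = (filtered & mask) | (original & ~mask),
with `flat` gating the 7-tap results and `flat2` gating the 15-tap results.
A minimal sketch of the pattern (hypothetical helper, not part of the patch):

    #include <emmintrin.h>

    /* Per-byte select: a where mask bytes are 0xff, b where they are 0x00.
     * The loopfilter expands exactly this three-instruction sequence for
     * each of the flat/flat2 blends. */
    static __m128i select_epi8(__m128i mask, __m128i a, __m128i b) {
      return _mm_or_si128(_mm_and_si128(mask, a),
                          _mm_andnot_si128(mask, b));
    }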
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 6bc51e8..475a299 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -72,7 +72,7 @@
}
if (!cm->frame_parallel_decoding_mode)
- update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
return tx_size;
}
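
Note: the decodemv change assumes `get_tx_counts()` returns a pointer to the
histogram row for (bsize, context), so the old `update_tx_counts()` helper
reduces to a single indexed increment. A self-contained model of that
accessor pattern (names illustrative; the real tables also vary by block
size):

    #include <stdio.h>

    enum { TX_SIZES = 4, TX_SIZE_CONTEXTS = 2 };
    struct tx_counts { unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES]; };

    /* Return the row for one context so the caller bumps a single bin. */
    static unsigned int *get_row(struct tx_counts *c, int ctx) {
      return c->p32x32[ctx];
    }

    int main(void) {
      struct tx_counts c = { { { 0 } } };
      ++get_row(&c, 1)[2];             /* same shape as the patched call */
      printf("%u\n", c.p32x32[1][2]);  /* prints 1 */
      return 0;
    }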
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 779368d..e3a2b77 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -37,6 +37,12 @@
#include "vp9/decoder/vp9_thread.h"
#include "vp9/decoder/vp9_treereader.h"
+typedef struct TileWorkerData {
+ VP9_COMMON *cm;
+ vp9_reader bit_reader;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} TileWorkerData;
+
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
@@ -103,7 +109,7 @@
static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
- for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
}
@@ -238,9 +244,8 @@
aligned_mi_cols));
}
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
const int stride = pd->dst.stride;
@@ -276,7 +281,7 @@
}
if (eob == 1) {
- *((int32_t *)qcoeff) = 0;
+ vpx_memset(qcoeff, 0, 2 * sizeof(qcoeff[0]));
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
@@ -286,9 +291,19 @@
}
}
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+struct intra_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct intra_args *const args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
@@ -307,25 +322,30 @@
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- if (!mi->mbmi.skip_coeff)
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff) {
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ args->r);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ }
}
-static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, vp9_reader *r) {
- MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+struct inter_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ int *eobtotal;
+};
- if (mbmi->skip_coeff) {
- reset_skip_context(xd, bsize);
- return -1;
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
+static void reconstruct_inter_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct inter_args *args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
- // TODO(dkovalev) if (!vp9_reader_has_error(r))
- return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize);
- }
+ *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, tx_size, args->r);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -376,14 +396,9 @@
static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize, int index) {
+ vp9_reader *r, BLOCK_SIZE bsize) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
- int eobtotal;
-
- if (less8x8)
- if (index > 0)
- return;
set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
@@ -393,32 +408,41 @@
// Has to be called after set_offsets
mbmi = &xd->mi_8x8[0]->mbmi;
- eobtotal = decode_tokens(cm, xd, bsize, r);
+
+ if (mbmi->skip_coeff) {
+ reset_skip_context(xd, bsize);
+ } else {
+ if (cm->seg.enabled)
+ setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+ cm->base_qindex));
+ }
if (!is_inter_block(mbmi)) {
- // Intra reconstruction
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ struct intra_args arg = { cm, xd, r };
+ foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+ &arg);
} else {
- // Inter reconstruction
- const int decode_blocks = (eobtotal > 0);
-
- if (!less8x8) {
- assert(mbmi->sb_type == bsize);
- if (eobtotal == 0)
- mbmi->skip_coeff = 1; // skip loopfilter
- }
-
+ // Setup
set_ref(cm, xd, 0, mi_row, mi_col);
if (has_second_ref(mbmi))
set_ref(cm, xd, 1, mi_row, mi_col);
xd->subpix.filter_x = xd->subpix.filter_y =
vp9_get_filter_kernel(mbmi->interp_filter);
+
+ // Prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- if (decode_blocks)
- foreach_transformed_block(xd, bsize, decode_block, xd);
+ // Reconstruction
+ if (!mbmi->skip_coeff) {
+ int eobtotal = 0;
+ struct inter_args arg = { cm, xd, r, &eobtotal };
+ foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ if (!less8x8 && eobtotal == 0)
+ mbmi->skip_coeff = 1; // skip loopfilter
+ }
}
+
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -442,54 +466,50 @@
static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize, int index) {
+ vp9_reader* r, BLOCK_SIZE bsize) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
- PARTITION_TYPE partition = PARTITION_NONE;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
+ int ctx;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (bsize < BLOCK_8X8) {
- if (index > 0)
- return;
- } else {
- const int ctx = partition_plane_context(xd->above_seg_context,
- xd->left_seg_context,
- mi_row, mi_col, bsize);
- partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
- cm->fc.partition_prob[cm->frame_type][ctx], r);
+ ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
+ cm->fc.partition_prob[cm->frame_type][ctx], r);
- if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[ctx][partition];
- }
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][partition];
subsize = get_subsize(bsize, partition);
-
- switch (partition) {
- case PARTITION_NONE:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
- break;
- case PARTITION_HORZ:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
- if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, 1);
- break;
- case PARTITION_VERT:
- decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, 0);
- if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, 1);
- break;
- case PARTITION_SPLIT: {
- int n;
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- decode_modes_sb(cm, xd, tile, mi_row + j * hbs, mi_col + i * hbs,
- r, subsize, n);
- }
- } break;
- default:
- assert(!"Invalid partition type");
+ if (subsize < BLOCK_8X8) {
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+ break;
+ case PARTITION_HORZ:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+ if (mi_row + hbs < cm->mi_rows)
+ decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+ break;
+ case PARTITION_VERT:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
+ if (mi_col + hbs < cm->mi_cols)
+ decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+ break;
+ case PARTITION_SPLIT:
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+ break;
+ default:
+ assert(!"Invalid partition type");
+ }
}
// update partition context
@@ -774,7 +794,7 @@
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
- decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, 0);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
if (pbi->do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -823,6 +843,27 @@
cm->log2_tile_rows += vp9_rb_read_bit(rb);
}
+// Reads the size of the next tile and advances '*data' past the 4-byte
+// length field; for the last tile ('is_last') the rest of the buffer is used.
+static size_t get_tile(const uint8_t *const data_end,
+ int is_last,
+ struct vpx_internal_error_info *error_info,
+ const uint8_t **data) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, 4, data_end))
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ size = read_be32(*data);
+ *data += 4;
+ } else {
+ size = data_end - *data;
+ }
+ return size;
+}
+
static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
vp9_reader residual_bc;
@@ -848,20 +889,15 @@
const uint8_t *data_ptr2[4][1 << 6];
vp9_reader bc_bak = {0};
- // pre-initialize the offsets, we're going to read in inverse order
+ // Pre-initialize the offsets; the tiles are decoded in inverse order.
data_ptr2[0][0] = data;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- if (tile_row) {
- const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
- data_ptr2[tile_row - 1][tile_cols - 1] += 4;
- data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
- }
-
- for (tile_col = 1; tile_col < tile_cols; tile_col++) {
- const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
- data_ptr2[tile_row][tile_col - 1] += 4;
- data_ptr2[tile_row][tile_col] =
- data_ptr2[tile_row][tile_col - 1] + size;
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ data_ptr2[tile_row][tile_col] = data;
+ data += size;
}
}
@@ -881,27 +917,15 @@
}
residual_bc = bc_bak;
} else {
- int has_more;
-
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
TileInfo tile;
- size_t size;
vp9_tile_init(&tile, cm, tile_row, tile_col);
- has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
- if (has_more) {
- if (!read_is_valid(data, 4, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt tile length");
-
- size = read_be32(data);
- data += 4;
- } else {
- size = data_end - data;
- }
-
setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
setup_tile_context(pbi, xd, tile_col);
decode_tile(pbi, &tile, &residual_bc);
@@ -913,6 +937,107 @@
return vp9_reader_find_end(&residual_bc);
}
+static int tile_worker_hook(void *arg1, void *arg2) {
+ TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+ const TileInfo *const tile = (TileInfo*)arg2;
+ int mi_row, mi_col;
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE)
+ decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
+ mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+ }
+ return !tile_data->xd.corrupted;
+}
+
+static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
+ VP9_COMMON *const cm = &pbi->common;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+ int tile_col = 0;
+
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ if (num_workers > pbi->num_tile_workers) {
+ int i;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ vpx_realloc(pbi->tile_workers,
+ num_workers * sizeof(*pbi->tile_workers)));
+ for (i = pbi->num_tile_workers; i < num_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ ++pbi->num_tile_workers;
+
+ vp9_worker_init(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ CHECK_MEM_ERROR(cm, worker->data1,
+ vpx_memalign(32, sizeof(TileWorkerData)));
+ CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
+ if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ }
+ }
+
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+
+ while (tile_col < tile_cols) {
+ int i;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+ TileInfo *const tile = (TileInfo*)worker->data2;
+ const size_t size =
+ get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+
+ tile_data->cm = cm;
+ tile_data->xd = pbi->mb;
+ tile_data->xd.corrupted = 0;
+ vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+
+ setup_token_decoder(data, data_end, size, &cm->error,
+ &tile_data->bit_reader);
+ setup_tile_context(pbi, &tile_data->xd, tile_col);
+
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+ vp9_worker_execute(worker);
+ } else {
+ vp9_worker_launch(worker);
+ }
+
+ data += size;
+ ++tile_col;
+ }
+
+ for (; i > 0; --i) {
+ VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ pbi->mb.corrupted |= !vp9_worker_sync(worker);
+ }
+ }
+
+ {
+ const int final_worker = (tile_cols + num_workers - 1) % num_workers;
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+ return vp9_reader_find_end(&tile_data->bit_reader);
+ }
+}
+
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
@@ -1041,7 +1166,11 @@
setup_tile_info(cm, rb);
sz = vp9_rb_read_literal(rb, 16);
- return sz > 0 ? sz : -1;
+ if (sz == 0)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid header size");
+
+ return sz;
}
static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
@@ -1148,39 +1277,31 @@
MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *data = pbi->source;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
- struct vp9_read_bit_buffer rb = { data, data_end, 0,
- cm, error_handler };
+ struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler };
const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
const int keyframe = cm->frame_type == KEY_FRAME;
- YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
+ const int tile_rows = 1 << cm->log2_tile_rows;
const int tile_cols = 1 << cm->log2_tile_cols;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
if (!first_partition_size) {
- if (!keyframe) {
// showing a frame directly
*p_data_end = data + 1;
return 0;
- } else {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid key frame");
- return -1;
- }
}
- data += vp9_rb_bytes_read(&rb);
- xd->corrupted = 0;
- new_fb->corrupted = 0;
- pbi->do_loopfilter_inline =
- (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
+ data += vp9_rb_bytes_read(&rb);
if (!read_is_valid(data, first_partition_size, data_end))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
+ pbi->do_loopfilter_inline =
+ (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData)));
pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
@@ -1190,28 +1311,31 @@
}
}
- setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
+ alloc_tile_storage(pbi, tile_cols);
xd->mi_8x8 = cm->mi_grid_visible;
xd->mode_info_stride = cm->mode_info_stride;
+ set_prev_mi(cm);
- alloc_tile_storage(pbi, tile_cols);
-
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
-
- vp9_zero(cm->counts);
-
- new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
-
+ setup_plane_dequants(cm, xd, cm->base_qindex);
setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
- // clear out the coeff buffer
+ cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cm->counts);
for (i = 0; i < MAX_MB_PLANE; ++i)
vp9_zero(xd->plane[i].qcoeff);
- set_prev_mi(cm);
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
- *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ // TODO(jzern): remove frame_parallel_decoding_mode restriction for
+ // single-frame tile decoding.
+ if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+ cm->frame_parallel_decoding_mode) {
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size);
+ } else {
+ *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ }
cm->last_width = cm->width;
cm->last_height = cm->height;
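
Note on the tile framing: get_tile() centralizes the rule that every tile
except the last is preceded by a 4-byte big-endian length (read_be32()),
while the last tile runs to the end of the buffer. A standalone model of
that walk (synthetic buffer; error handling elided):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t read_be32(const uint8_t *p) {
      return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
    }

    int main(void) {
      /* two tiles: [len=3][3 payload bytes][last tile, 2 bytes] */
      const uint8_t buf[] = { 0, 0, 0, 3, 0xaa, 0xbb, 0xcc, 0x11, 0x22 };
      const uint8_t *data = buf, *const end = buf + sizeof(buf);
      const int tiles = 2;
      int tile;

      for (tile = 0; tile < tiles; ++tile) {
        const int is_last = (tile == tiles - 1);
        size_t size;
        if (!is_last) {
          size = read_be32(data);   /* big-endian length prefix */
          data += 4;
        } else {
          size = (size_t)(end - data);
        }
        printf("tile %d: %zu bytes\n", tile, size);
        data += size;
      }
      return 0;
    }

Note on decode_tiles_mt(): each batch launches up to num_workers tile
columns asynchronously, runs the last one on the calling thread
(vp9_worker_execute), then syncs in reverse order and ORs the per-worker
corruption flags into pbi->mb.corrupted. The same launch/sync shape with
plain pthreads as a stand-in for the VP9Worker API (a sketch, not the
library's code):

    #include <pthread.h>
    #include <stdio.h>

    typedef struct { int id; int corrupted; pthread_t thread; } Worker;

    static void *tile_hook(void *arg) {
      Worker *const w = (Worker *)arg;
      w->corrupted = 0;  /* decode one tile; record errors, don't abort */
      return NULL;
    }

    int main(void) {
      enum { kWorkers = 4 };
      Worker workers[kWorkers];
      int i, corrupted = 0;

      for (i = 0; i < kWorkers - 1; ++i) {  /* launch all but the last */
        workers[i].id = i;
        pthread_create(&workers[i].thread, NULL, tile_hook, &workers[i]);
      }
      workers[i].id = i;
      tile_hook(&workers[i]);              /* last one runs synchronously */

      for (i = kWorkers - 2; i >= 0; --i)  /* sync in reverse order */
        pthread_join(workers[i].thread, NULL);
      for (i = 0; i < kWorkers; ++i)
        corrupted |= workers[i].corrupted;
      printf("corrupted=%d\n", corrupted);
      return 0;
    }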
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 0d0f0df..6ecce28 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -210,45 +210,25 @@
return c;
}
-struct decode_block_args {
- VP9_COMMON *cm;
- MACROBLOCKD *xd;
- struct segmentation *seg;
- vp9_reader *r;
- int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *argv) {
- const struct decode_block_args* const arg = argv;
-
- // find the maximum eob for this transform size, adjusted by segment
- MACROBLOCKD *xd = arg->xd;
- const struct segmentation *seg = arg->seg;
- struct macroblockd_plane* pd = &xd->plane[plane];
- const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+ tx_size);
int aoff, loff, eob, pt;
-
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
pt = get_entropy_context(tx_size, pd->above_context + aoff,
pd->left_context + loff);
- eob = decode_coefs(arg->cm, xd, arg->r, block,
+ eob = decode_coefs(cm, xd, r, block,
pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
tx_size, pd->dequant, pt);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
pd->eobs[block] = eob;
- *arg->eobtotal += eob;
+ return eob;
}
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize) {
- int eobtotal = 0;
- struct decode_block_args args = {cm, xd, seg, r, &eobtotal};
- foreach_transformed_block(xd, bsize, decode_block, &args);
- return eobtotal;
-}
+
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 0fb4c3c..94dd8e4 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,8 +15,8 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index ada73cc..5f970a3 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -147,6 +147,7 @@
}
void vp9_remove_decompressor(VP9D_PTR ptr) {
+ int i;
VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
if (!pbi)
@@ -155,6 +156,13 @@
vp9_remove_common(&pbi->common);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ vp9_worker_end(worker);
+ vpx_free(worker->data1);
+ vpx_free(worker->data2);
+ }
+ vpx_free(pbi->tile_workers);
vpx_free(pbi->mi_streams);
vpx_free(pbi->above_context[0]);
vpx_free(pbi->above_seg_context);
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 7739952..83ea967 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -40,6 +40,9 @@
int do_loopfilter_inline; // apply loopfilter to available rows immediately
VP9Worker lf_worker;
+ VP9Worker *tile_workers;
+ int num_tile_workers;
+
/* Each tile column has its own MODE_INFO stream. This array indexes them by
tile column index. */
MODE_INFO **mi_streams;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index c677907..a996e0e 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -53,8 +53,7 @@
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
void init_tx_count_stats() {
vp9_zero(tx_count_32x32p_stats);
@@ -87,10 +86,9 @@
static void update_switchable_interp_stats(VP9_COMMON *cm) {
int i, j;
- for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
- for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ for (j = 0; j < SWITCHABLE_FILTERS; ++j)
switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
- }
}
void write_tx_count_stats() {
@@ -140,9 +138,9 @@
fclose(fp);
printf(
- "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+ "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
"[SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
printf(" { ");
for (j = 0; j < SWITCHABLE_FILTERS; j++) {
printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -236,17 +234,16 @@
static void update_switchable_interp_probs(VP9_COMP *const cpi,
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+ unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
+ vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
int i, j;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
vp9_tree_probs_from_distribution(
vp9_switchable_interp_tree,
new_prob[j], branch_ct[j],
cm->counts.switchable_interp[j], 0);
}
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
branch_ct[j][i]);
@@ -1142,7 +1139,7 @@
int i, j, c = 0;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
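
Note: the `SWITCHABLE_FILTERS + 1` → `SWITCHABLE_FILTER_CONTEXTS` replacement
here and in the files below is purely a rename; it presumes the new constant
is defined as the same value (one entropy context per filter, plus one for
the mixed/no-dominant-filter case), along the lines of:

    /* presumed definition (lives in vp9_entropymode.h, not in this patch) */
    #define SWITCHABLE_FILTERS 3  /* 8-tap, 8-tap smooth, 8-tap sharp */
    #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)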
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index db2564b..583c6c8 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -42,7 +42,7 @@
int comp_pred_diff;
int single_pred_diff;
int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
// motion vector cache for adaptive motion search control in partition
// search loop
@@ -118,8 +118,7 @@
unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
- int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 5ff59a8..44ade18 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -465,7 +465,7 @@
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
@@ -2279,7 +2279,7 @@
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
cpi->rd_filter_threshes[frame_type][i] =
(cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2493,7 +2493,7 @@
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 91546e8..e52e8ec 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -418,6 +418,7 @@
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx *const ctx = args->ctx;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
@@ -429,14 +430,18 @@
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+ int x, y;
pd->eobs[block] = 0;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ ctx->ta[plane][x] = 0;
+ ctx->tl[plane][y] = 0;
return;
}
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize)
- vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+ vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
if (x->skip_encode || pd->eobs[block] == 0)
return;
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index b867d8b..7eb6592 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -36,7 +36,7 @@
vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
- for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index ad214c7..b664f1e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -2866,7 +2866,7 @@
cpi->active_best_quality = inter_minq[q];
// 1-pass: for now, use the average Q for the active_best, if it's lower
// than active_worst.
- if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality))
+ if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
#endif
@@ -2902,7 +2902,14 @@
if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
*top_index =
(cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+ // If this is the first (key) frame in 1-pass, set active best to the
+ // user's best-allowed quality and leave top_index at active_worst.
+ if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ *top_index = cpi->oxcf.worst_allowed_q;
+ }
} else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
*top_index =
(cpi->active_worst_quality + cpi->active_best_quality) / 2;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 20831be..0498043 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -396,9 +396,9 @@
// FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][TX_MODES];
- int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int RDMULT;
int RDDIV;
@@ -641,7 +641,7 @@
int dummy_packing; /* flag to indicate if packing is dummy */
- unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int tx_stepdown_count[TX_SIZES];
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 7ad8d1f..fca7525 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -22,12 +22,14 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
- int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
@@ -86,14 +88,15 @@
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr,
- int16_t *quant_ptr, int16_t *quant_shift_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2];
int x, y, z, sz;
@@ -174,25 +177,19 @@
return res;
}
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan) {
+ MACROBLOCKD *const xd = &x->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int16_t *scan = get_scan_4x4(tx_type);
- const int16_t *iscan = get_iscan_4x4(tx_type);
+ struct macroblock_plane* p = &x->plane[pb_idx.plane];
+ struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
- 16, mb->skip_block,
- mb->plane[pb_idx.plane].zbin,
- mb->plane[pb_idx.plane].round,
- mb->plane[pb_idx.plane].quant,
- mb->plane[pb_idx.plane].quant_shift,
- BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
- BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
- xd->plane[pb_idx.plane].dequant,
- mb->plane[pb_idx.plane].zbin_extra,
- &xd->plane[pb_idx.plane].eobs[pb_idx.block],
- scan, iscan);
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+ BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
+ pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
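
Note: besides const-qualifying the read-only pointers, the quantize change
moves the scan-table lookup to the callers (see the vp9_rdopt.c hunks below),
so vp9_regular_quantize_b_4x4() no longer needs a TX_TYPE. The core loop the
signatures describe walks coefficients in scan order and tracks the last
nonzero index as eob; a simplified scalar model (the real code also applies
zbin/round/quant_shift per coefficient):

    #include <stdint.h>
    #include <stdio.h>

    static void quantize_model(const int16_t *coeff, intptr_t n_coeffs,
                               const int16_t *scan, int16_t dequant,
                               int16_t *qcoeff, uint16_t *eob_ptr) {
      intptr_t i;
      int eob = -1;
      for (i = 0; i < n_coeffs; ++i) {
        const int rc = scan[i];                 /* raster index of coeff i */
        const int q = coeff[rc] / dequant;      /* stand-in for zbin logic */
        qcoeff[rc] = (int16_t)q;
        if (q) eob = (int)i;                    /* last nonzero, scan order */
      }
      *eob_ptr = (uint16_t)(eob + 1);
    }

    int main(void) {
      const int16_t coeff[4] = { 90, 0, 33, 0 };
      const int16_t scan[4] = { 0, 1, 2, 3 };
      int16_t qcoeff[4];
      uint16_t eob;
      quantize_model(coeff, 4, scan, 32, qcoeff, &eob);
      printf("eob=%u\n", (unsigned)eob);  /* 3: last nonzero is index 2 */
      return 0;
    }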
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 459aa33..c078e1d 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -13,8 +13,9 @@
#include "vp9/encoder/vp9_block.h"
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan);
+
struct VP9_COMP;
void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e4be046..f69b19b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -611,7 +611,7 @@
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
if (plane == 0)
- x->zcoeff_blk[tx_size][block] = rd1 > rd2;
+ x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
args->this_rate += args->rate;
args->this_dist += args->dist;
@@ -1032,10 +1032,10 @@
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
- TX_TYPE tx_type = DCT_DCT;
+
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
- int idx, idy, block;
+ int idx, idy;
uint8_t best_dst[8 * 8];
assert(ib < 4);
@@ -1071,8 +1071,8 @@
const int16_t *nb;
uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
- block = ib + idy * 2 + idx;
+ const int block = ib + idy * 2 + idx;
+ TX_TYPE tx_type;
xd->mi_8x8[0]->bmi[block].as_mode = mode;
src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1086,13 +1086,15 @@
dst, dst_stride);
tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
+ get_scan_nb_4x4(tx_type, &scan, &nb);
+
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
else
x->fwd_txm4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, block, tx_type, 16);
- get_scan_nb_4x4(tx_type, &scan, &nb);
+ vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+
ratey += cost_coeffs(x, 0, block,
tempa + idx, templ + idy, TX_4X4, scan, nb);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1558,7 +1560,8 @@
coeff = BLOCK_OFFSET(p->coeff, k);
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16);
+ vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
+ get_iscan_4x4(DCT_DCT));
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
@@ -1651,6 +1654,7 @@
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
const int label_count = 4;
int64_t this_segment_rd = 0;
int label_mv_thresh;
@@ -1665,8 +1669,8 @@
int subpelmv = 1, have_ref = 0;
const int has_second_rf = has_second_ref(mbmi);
- vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+ vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
v_fn_ptr = &cpi->fn_ptr[bsize];
@@ -1744,7 +1748,7 @@
}
}
- vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
sizeof(bsi->rdstat[i][mode_idx].ta));
vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
@@ -1948,6 +1952,13 @@
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
sizeof(SEG_RDSTAT));
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs =
+ ref_bsi->rdstat[i + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs =
+ ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1968,7 +1979,11 @@
bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
bsi->rdstat[i][mode_idx].brate, 0);
bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+ bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
}
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -2204,7 +2219,7 @@
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
@@ -2222,7 +2237,7 @@
vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_pred_block(const MACROBLOCKD *xd,
@@ -2268,12 +2283,8 @@
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
- scale[frame_type].x_offset_q4 =
- ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].sfc->x_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
- scale[frame_type].y_offset_q4 =
- ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].sfc->y_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
+ scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+ mi_row * MI_SIZE, mi_col * MI_SIZE);
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
@@ -3116,8 +3127,8 @@
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int j;
int mode_index, best_mode_index = 0;
@@ -3155,7 +3166,7 @@
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3542,7 +3553,7 @@
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
@@ -3625,7 +3636,7 @@
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -3747,7 +3758,7 @@
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
@@ -3812,8 +3823,8 @@
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int mode_index, best_mode_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -3835,7 +3846,7 @@
int best_skip2 = 0;
x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
- vp9_zero(x->zcoeff_blk);
+ vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
for (i = 0; i < 4; i++) {
int j;
@@ -3850,7 +3861,7 @@
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -4136,8 +4147,10 @@
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < 4; i++) {
tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+ x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+ }
pred_exists = 1;
if (switchable_filter_index == 0 &&
cpi->sf.use_rd_breakout &&
@@ -4292,7 +4305,7 @@
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
@@ -4370,7 +4383,7 @@
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -4486,7 +4499,7 @@
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 11fa2e0..0badb08 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,7 @@
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0f12d88..4d39670 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -995,8 +995,9 @@
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
- res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
- scalemode.v_scaling_mode);
+ res = vp9_set_internal_size(ctx->cpi,
+ (VPX_SCALING)scalemode.h_scaling_mode,
+ (VPX_SCALING)scalemode.v_scaling_mode);
if (!res) {
return VPX_CODEC_OK;
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 57d3cae..056fa7a 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -100,14 +100,17 @@
/*!\brief reference frame data struct
*
- * define the data struct to access vp8 reference frames
+ * Define the data struct to access vp8 reference frames.
*/
-
typedef struct vpx_ref_frame {
vpx_ref_frame_type_t frame_type; /**< which reference frame */
vpx_image_t img; /**< reference frame data in image format */
} vpx_ref_frame_t;
+/*!\brief VP9 specific reference frame data struct
+ *
+ * Define the data struct to access vp9 reference frames.
+ */
typedef struct vp9_ref_frame {
int idx; /**< frame index to get (input) */
vpx_image_t img; /**< img structure to populate (output) */
@@ -117,7 +120,6 @@
*
* Defines the data type that each VP8 decoder control function requires.
*/
-
VPX_CTRL_USE_TYPE(VP8_SET_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *)
VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *)
@@ -127,7 +129,6 @@
VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV, int)
VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *)
-
/*! @} - end defgroup vp8 */
#ifdef __cplusplus
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 92fdb00..9f68c38 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -7,7 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-
+#ifndef VP8CX_H
+#define VP8CX_H
/*!\defgroup vp8_encoder WebM VP8 Encoder
* \ingroup vp8
@@ -20,8 +21,6 @@
* \brief Provides definitions for using the VP8 encoder algorithm within the
* vpx Codec Interface.
*/
-#ifndef VP8CX_H
-#define VP8CX_H
#ifdef __cplusplus
extern "C" {
@@ -223,16 +222,17 @@
*/
typedef struct vpx_roi_map {
- unsigned char *roi_map; /**< specify an id between 0 and 3 for each 16x16 region within a frame */
- unsigned int rows; /**< number of rows */
- unsigned int cols; /**< number of cols */
+ /*! An id between 0 and 3 for each 16x16 region within a frame. */
+ unsigned char *roi_map;
+ unsigned int rows; /**< Number of rows. */
+ unsigned int cols; /**< Number of columns. */
// TODO(paulwilkins): broken for VP9 which has 8 segments
// q and loop filter deltas for each segment
// (see MAX_MB_SEGMENTS)
- int delta_q[4];
- int delta_lf[4];
- // Static breakout threshold for each segment
- unsigned int static_threshold[4];
+ int delta_q[4]; /**< Quantizer deltas. */
+ int delta_lf[4]; /**< Loop filter deltas. */
+ /*! Static breakout threshold for each segment. */
+ unsigned int static_threshold[4];
} vpx_roi_map_t;
/*!\brief vpx active region map
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 50a223f..d3093c4 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -45,7 +45,8 @@
#include "vp8.h"
-/*!\brief VP8 decoder control functions
+/*!\enum vp8_dec_control_id
+ * \brief VP8 decoder control functions
*
* This set of macros defines the control functions available for the VP8
* decoder interface.
@@ -78,12 +79,17 @@
VP8_DECODER_CTRL_ID_MAX
};
+/*!\brief Structure to hold decryption state
+ *
+ * Defines a structure to hold the decryption state and access function.
+ */
typedef struct vp8_decrypt_init {
/** Decrypt n bytes of data from input -> output, using the decrypt_state
* passed in VP8D_SET_DECRYPTOR.
*/
void (*decrypt_cb)(void *decrypt_state, const unsigned char *input,
unsigned char *output, int count);
+ /*! Decryption state. */
void *decrypt_state;
} vp8_decrypt_init;
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 2e6f1e7..3ea36d6 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -36,12 +36,13 @@
* Once initialized, the instance is managed using other functions from
* the vpx_codec_* family.
*/
+#ifndef VPX_CODEC_H
+#define VPX_CODEC_H
+
#ifdef __cplusplus
extern "C" {
#endif
-#ifndef VPX_CODEC_H
-#define VPX_CODEC_H
#include "vpx_integer.h"
#include "vpx_image.h"
@@ -550,9 +551,8 @@
/*!@} - end defgroup cap_xma*/
/*!@} - end defgroup codec*/
-
-
-#endif
#ifdef __cplusplus
}
#endif
+#endif
+
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index e7701e5..2dcd024 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -7,7 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-
+#ifndef VPX_DECODER_H
+#define VPX_DECODER_H
/*!\defgroup decoder Decoder Algorithm Interface
* \ingroup codec
@@ -28,8 +29,6 @@
extern "C" {
#endif
-#ifndef VPX_DECODER_H
-#define VPX_DECODER_H
#include "vpx_codec.h"
/*!\brief Current ABI version number
@@ -328,9 +327,8 @@
/*!@} - end defgroup cap_put_slice*/
/*!@} - end defgroup decoder*/
-
-#endif
-
#ifdef __cplusplus
}
#endif
+#endif
+
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 56fd2d9..56752cf 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -7,7 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-
+#ifndef VPX_ENCODER_H
+#define VPX_ENCODER_H
/*!\defgroup encoder Encoder Algorithm Interface
* \ingroup codec
@@ -28,8 +29,6 @@
extern "C" {
#endif
-#ifndef VPX_ENCODER_H
-#define VPX_ENCODER_H
#include "vpx_codec.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
@@ -930,8 +929,8 @@
/*!@} - end defgroup encoder*/
-
-#endif
#ifdef __cplusplus
}
#endif
+#endif
+
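
Note: the header hunks above (vp8cx.h, vpx_codec.h, vpx_decoder.h,
vpx_encoder.h) all make the same structural fix: the include guard must be
the first and last thing in the file so that a second inclusion also skips
the `extern "C"` block. The layout they converge on, sketched with a
placeholder guard name:

    #ifndef VPX_SOMETHING_H
    #define VPX_SOMETHING_H

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* ... declarations ... */

    #ifdef __cplusplus
    }  /* extern "C" */
    #endif

    #endif  /* VPX_SOMETHING_H */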
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index b009c35..2990583 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -88,12 +88,14 @@
#endif
#endif /* end others */
-#define HAS_MMX 0x01
-#define HAS_SSE 0x02
-#define HAS_SSE2 0x04
-#define HAS_SSE3 0x08
-#define HAS_SSSE3 0x10
-#define HAS_SSE4_1 0x20
+#define HAS_MMX 0x01
+#define HAS_SSE 0x02
+#define HAS_SSE2 0x04
+#define HAS_SSE3 0x08
+#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
+#define HAS_AVX 0x40
+#define HAS_AVX2 0x80
#ifndef BIT
#define BIT(n) (1<<n)
#endif
@@ -132,12 +134,16 @@
if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
- if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+ if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
- if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+ if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+ if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+
+ if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+
return flags & mask;
}
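
Note: the new capability bits follow the documented CPUID layout: AVX is
ECX bit 28 of leaf 1, AVX2 is EBX bit 5 of leaf 7 (the surrounding
x86_simd_caps() code is assumed to have fetched both leaves into reg_ecx
and reg_ebx). A standalone check with the GCC/Clang cpuid helpers; a
complete AVX test would also verify OS XSAVE support via XGETBV, which is
omitted here as it is in the patch:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
      unsigned int eax, ebx, ecx, edx;
      int has_avx = 0, has_avx2 = 0;

      if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        has_avx = (ecx >> 28) & 1;                /* leaf 1, ECX bit 28 */
      if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        has_avx2 = (ebx >> 5) & 1;                /* leaf 7, EBX bit 5 */

      printf("AVX=%d AVX2=%d\n", has_avx, has_avx2);
      return 0;
    }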