Merge "vp9: fix -Wclobbered (longjmp + local variables)"
diff --git a/configure b/configure
index 3ac00ff..4509697 100755
--- a/configure
+++ b/configure
@@ -36,6 +36,7 @@
${toggle_codec_srcs} in/exclude codec library source code
${toggle_debug_libs} in/exclude debug version of libraries
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
+ ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles
${toggle_vp8} VP8 codec support
${toggle_vp9} VP9 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index a76d806..573870e 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -246,6 +246,8 @@
for (int i = 0; i < 3; ++i) {
bits_total_[i] = 0;
}
+ denoiser_offon_test_ = 0;
+ denoiser_offon_period_ = -1;
}
//
@@ -313,10 +315,20 @@
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
::libvpx_test::Encoder *encoder) {
- if (video->frame() == 1) {
+ if (video->frame() == 1)
encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
- encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+ if (denoiser_offon_test_) {
+ ASSERT_GT(denoiser_offon_period_, 0)
+ << "denoiser_offon_period_ is not positive.";
+ if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+ // Flip denoiser_on_ periodically
+ denoiser_on_ ^= 1;
+ }
}
+
+ encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
if (cfg_.ts_number_layers > 1) {
if (video->frame() == 1) {
encoder->Control(VP9E_SET_SVC, 1);
@@ -398,6 +410,8 @@
vpx_codec_pts_t first_drop_;
int num_drops_;
int denoiser_on_;
+ int denoiser_offon_test_;
+ int denoiser_offon_period_;
};
// Check basic rate targeting,
@@ -652,6 +666,38 @@
ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
<< " The datarate for the file is greater than target by too much!";
}
+
+// Check basic datarate targeting, for a single bitrate, when denoiser is off
+// and on.
+TEST_P(DatarateTestVP9Large, DenoiserOffon) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 299);
+
+ // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING),
+ // there is only one denoiser mode: denoiserYonly(which is 1),
+ // but may add more modes in the future.
+ cfg_.rc_target_bitrate = 300;
+ ResetModel();
+ // The denoiser is off by default.
+ denoiser_on_ = 0;
+ // Set the offon test flag.
+ denoiser_offon_test_ = 1;
+ denoiser_offon_period_ = 100;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+}
#endif // CONFIG_VP9_TEMPORAL_DENOISING
VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES);
diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
index 33399e9..c24d517 100644
--- a/test/decode_perf_test.cc
+++ b/test/decode_perf_test.cc
@@ -176,7 +176,8 @@
// Write frame header and data.
ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
- ASSERT_GT(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), 0);
+ ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+ pkt->data.frame.sz);
}
virtual bool DoDecode() { return false; }
diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc
index 769e14a..7e9f0d6 100644
--- a/test/encode_perf_test.cc
+++ b/test/encode_perf_test.cc
@@ -189,7 +189,7 @@
printf("\t\"totalFrames\" : %u,\n", frames);
printf("\t\"framesPerSecond\" : %f,\n", fps);
printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
- printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]);
+ printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
printf("}\n");
}
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 7a13364..77135ec 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -64,8 +64,7 @@
// Encode the frame
API_REGISTER_STATE_CHECK(
- res = vpx_codec_encode(&encoder_,
- video.img(), video.pts(), video.duration(),
+ res = vpx_codec_encode(&encoder_, img, video.pts(), video.duration(),
frame_flags, deadline_));
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
}
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index cdc0a98..c836fac 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -107,6 +107,36 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
+#if HAVE_NEON_ASM
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
+void wrapper_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_c(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_neon(s, p, blimit, limit, thresh);
+}
+
+void wrapper_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ vp9_lpf_vertical_16_dual_c(s, p, blimit, limit, thresh);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON_ASM
+
class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> {
public:
virtual ~Loop8Test6Param() {}
@@ -594,33 +624,45 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
-#if HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+// No neon high bitdepth functions.
+#else
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test6Param,
::testing::Values(
#if HAVE_NEON_ASM
+// Using #if inside the macro is unsupported on MSVS but the tests are not
+// currently built for MSVS with ARM and NEON.
make_tuple(&vp9_lpf_horizontal_16_neon,
&vp9_lpf_horizontal_16_c, 8),
+ make_tuple(&wrapper_vertical_16_neon,
+ &wrapper_vertical_16_c, 8),
+ make_tuple(&wrapper_vertical_16_dual_neon,
+ &wrapper_vertical_16_dual_c, 8),
+ make_tuple(&vp9_lpf_horizontal_8_neon,
+ &vp9_lpf_horizontal_8_c, 8),
+ make_tuple(&vp9_lpf_vertical_8_neon,
+ &vp9_lpf_vertical_8_c, 8),
#endif // HAVE_NEON_ASM
make_tuple(&vp9_lpf_horizontal_4_neon,
&vp9_lpf_horizontal_4_c, 8),
- make_tuple(&vp9_lpf_horizontal_8_neon,
- &vp9_lpf_horizontal_8_c, 8),
make_tuple(&vp9_lpf_vertical_4_neon,
- &vp9_lpf_vertical_4_c, 8),
- make_tuple(&vp9_lpf_vertical_8_neon,
- &vp9_lpf_vertical_8_c, 8)));
+ &vp9_lpf_vertical_4_c, 8)));
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test9Param,
::testing::Values(
- make_tuple(&vp9_lpf_horizontal_4_dual_neon,
- &vp9_lpf_horizontal_4_dual_c, 8),
+#if HAVE_NEON_ASM
make_tuple(&vp9_lpf_horizontal_8_dual_neon,
&vp9_lpf_horizontal_8_dual_c, 8),
- make_tuple(&vp9_lpf_vertical_4_dual_neon,
- &vp9_lpf_vertical_4_dual_c, 8),
make_tuple(&vp9_lpf_vertical_8_dual_neon,
- &vp9_lpf_vertical_8_dual_c, 8)));
-#endif // HAVE_NEON && (!CONFIG_VP9_HIGHBITDEPTH)
+ &vp9_lpf_vertical_8_dual_c, 8),
+#endif // HAVE_NEON_ASM
+ make_tuple(&vp9_lpf_horizontal_4_dual_neon,
+ &vp9_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&vp9_lpf_vertical_4_dual_neon,
+ &vp9_lpf_vertical_4_dual_c, 8)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON
} // namespace
diff --git a/test/video_source.h b/test/video_source.h
index 84bfa8e..b97e155 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -175,8 +175,8 @@
void SetSize(unsigned int width, unsigned int height) {
if (width != width_ || height != height_) {
vpx_img_free(img_);
- raw_sz_ = ((width + 31)&~31) * height * 3 / 2;
img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, width, height, 32);
+ raw_sz_ = ((img_->w + 31) & ~31) * img_->h * 3 / 2;
width_ = width;
height_ = height;
}
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 6810644..67a0fef 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -188,7 +188,7 @@
/* vet via sync code */
if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ return VPX_CODEC_UNSUP_BITSTREAM;
si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -402,7 +402,7 @@
if (!res)
{
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
- if(resolution_change)
+ if (resolution_change)
{
VP8_COMMON *const pc = & pbi->common;
MACROBLOCKD *const xd = & pbi->mb;
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index 09f470e..c69ee10 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -124,7 +124,6 @@
return;
}
-#if !HAVE_NEON_ASM
void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
@@ -178,47 +177,3 @@
vst1q_u8(s, q8u8);
return;
}
-#endif // !HAVE_NEON_ASM
-
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
-
-#if HAVE_NEON_ASM
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
- vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif // HAVE_NEON_ASM
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
new file mode 100644
index 0000000..fd9db61
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void vp9_lpf_horizontal_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < count; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
new file mode 100644
index 0000000..7738e0d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_lpf_horizontal_4_neon|
+ EXPORT |vp9_lpf_vertical_4_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_horizontal_4_neon|
+
+; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
+; works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_lf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_vertical_4_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The invidual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; abs(m1) > limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a > blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+ END
diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
new file mode 100644
index 0000000..33068a8
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p2
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p1
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q1
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_horizontal_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_mblf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ if (count == 0)
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < count; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
similarity index 65%
rename from vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
rename to vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
index 4430322..91aaec0 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm
+++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
@@ -8,8 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_lpf_horizontal_4_neon|
- EXPORT |vp9_lpf_vertical_4_neon|
EXPORT |vp9_lpf_horizontal_8_neon|
EXPORT |vp9_lpf_vertical_8_neon|
ARM
@@ -21,261 +19,6 @@
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_horizontal_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- ldr r2, [sp, #4] ; load thresh
- add r1, r1, r1 ; double pitch
-
- cmp r12, #0
- beq end_vp9_lf_h_edge
-
- vld1.8 {d1[]}, [r3] ; duplicate *limit
- vld1.8 {d2[]}, [r2] ; duplicate *thresh
-
-count_lf_h_loop
- sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
- add r3, r2, r1, lsr #1 ; set to 3 lines down
-
- vld1.u8 {d3}, [r2@64], r1 ; p3
- vld1.u8 {d4}, [r3@64], r1 ; p2
- vld1.u8 {d5}, [r2@64], r1 ; p1
- vld1.u8 {d6}, [r3@64], r1 ; p0
- vld1.u8 {d7}, [r2@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r2@64] ; q2
- vld1.u8 {d18}, [r3@64] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r3, r3, r1, lsl #1
-
- bl vp9_loop_filter_neon
-
- vst1.u8 {d4}, [r2@64], r1 ; store op1
- vst1.u8 {d5}, [r3@64], r1 ; store op0
- vst1.u8 {d6}, [r2@64], r1 ; store oq0
- vst1.u8 {d7}, [r3@64], r1 ; store oq1
-
- add r0, r0, #8
- subs r12, r12, #1
- bne count_lf_h_loop
-
-end_vp9_lf_h_edge
- pop {pc}
- ENDP ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_vertical_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- vld1.8 {d1[]}, [r3] ; duplicate *limit
-
- ldr r3, [sp, #4] ; load thresh
- sub r2, r0, #4 ; move s pointer down by 4 columns
- cmp r12, #0
- beq end_vp9_lf_v_edge
-
- vld1.8 {d2[]}, [r3] ; duplicate *thresh
-
-count_lf_v_loop
- vld1.u8 {d3}, [r2], r1 ; load s data
- vld1.u8 {d4}, [r2], r1
- vld1.u8 {d5}, [r2], r1
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d7}, [r2], r1
- vld1.u8 {d16}, [r2], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d18}, [r2]
-
- ;transpose to 8x16 matrix
- vtrn.32 d3, d7
- vtrn.32 d4, d16
- vtrn.32 d5, d17
- vtrn.32 d6, d18
-
- vtrn.16 d3, d5
- vtrn.16 d4, d6
- vtrn.16 d7, d17
- vtrn.16 d16, d18
-
- vtrn.8 d3, d4
- vtrn.8 d5, d6
- vtrn.8 d7, d16
- vtrn.8 d17, d18
-
- bl vp9_loop_filter_neon
-
- sub r0, r0, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
- vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
- vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
- vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
- vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
- vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
- vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
- vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
- add r0, r0, r1, lsl #3 ; s += pitch * 8
- subs r12, r12, #1
- subne r2, r0, #4 ; move s pointer down by 4 columns
- bne count_lf_v_loop
-
-end_vp9_lf_v_edge
- pop {pc}
- ENDP ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The invidual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0 blimit
-; d1 limit
-; d2 thresh
-; d3 p3
-; d4 p2
-; d5 p1
-; d6 p0
-; d7 q0
-; d16 q1
-; d17 q2
-; d18 q3
-;
-; Outputs:
-; d4 op1
-; d5 op0
-; d6 oq0
-; d7 oq1
-|vp9_loop_filter_neon| PROC
- ; filter_mask
- vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
- vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
- vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
- vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
- vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
- vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
-
- ; only compare the largest value to limit
- vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
- vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
-
- vabd.u8 d17, d6, d7 ; abs(p0 - q0)
-
- vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
-
- vmov.u8 d18, #0x80
-
- vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
-
- ; hevmask
- vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
-
- vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
- vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
-
- veor d7, d7, d18 ; qs0
-
- vcge.u8 d23, d1, d23 ; abs(m1) > limit
-
- ; filter() function
- ; convert to signed
-
- vshr.u8 d28, d28, #1 ; a = a / 2
- veor d6, d6, d18 ; ps0
-
- veor d5, d5, d18 ; ps1
- vqadd.u8 d17, d17, d28 ; a = b + a
-
- veor d16, d16, d18 ; qs1
-
- vmov.u8 d19, #3
-
- vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
-
- vcge.u8 d17, d0, d17 ; a > blimit
-
- vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
- vorr d22, d21, d22 ; hevmask
-
- vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
-
- vand d27, d27, d22 ; filter &= hev
- vand d23, d23, d17 ; filter_mask
-
- vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
-
- vmov.u8 d17, #4
-
- ; filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d27, q12
-
- vand d27, d27, d23 ; filter &= mask
-
- vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
- vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
- vshr.s8 d28, d28, #3 ; filter2 >>= 3
- vshr.s8 d27, d27, #3 ; filter1 >>= 3
-
- vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
- vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
-
- ; outer tap adjustments
- vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1
-
- veor d6, d26, d18 ; *oq0 = u^0x80
-
- vbic d27, d27, d22 ; filter &= ~hev
-
- vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
- vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
-
- veor d5, d19, d18 ; *op0 = u^0x80
- veor d4, d21, d18 ; *op1 = u^0x80
- veor d7, d20, d18 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp9_loop_filter_neon|
-
; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c
index 079d266..31fcc63 100644
--- a/vp9/common/arm/neon/vp9_loopfilter_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -10,705 +10,49 @@
#include <arm_neon.h>
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
-static INLINE void vp9_loop_filter_neon(
- uint8x8_t dblimit, // flimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p3
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d4ru8, // p1
- uint8x8_t *d5ru8, // p0
- uint8x8_t *d6ru8, // q0
- uint8x8_t *d7ru8) { // q1
- uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
- int16x8_t q12s16;
- int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d3u8 = vabd_u8(d17u8, d16u8);
- d4u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
- d3u8 = vmax_u8(d3u8, d4u8);
- d23u8 = vmax_u8(d19u8, d20u8);
-
- d17u8 = vabd_u8(d6u8, d7u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
- d22u8 = vcgt_u8(d22u8, dthresh);
- d23u8 = vmax_u8(d23u8, d3u8);
-
- d28u8 = vabd_u8(d5u8, d16u8);
- d17u8 = vqadd_u8(d17u8, d17u8);
-
- d23u8 = vcge_u8(dlimit, d23u8);
-
- d18u8 = vdup_n_u8(0x80);
- d5u8 = veor_u8(d5u8, d18u8);
- d6u8 = veor_u8(d6u8, d18u8);
- d7u8 = veor_u8(d7u8, d18u8);
- d16u8 = veor_u8(d16u8, d18u8);
-
- d28u8 = vshr_n_u8(d28u8, 1);
- d17u8 = vqadd_u8(d17u8, d28u8);
-
- d19u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
- vreinterpret_s8_u8(d6u8));
-
- d17u8 = vcge_u8(dblimit, d17u8);
-
- d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
- vreinterpret_s8_u8(d16u8));
-
- d22u8 = vorr_u8(d21u8, d22u8);
-
- q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
- d23u8 = vand_u8(d23u8, d17u8);
-
- q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
-
- d17u8 = vdup_n_u8(4);
-
- d27s8 = vqmovn_s16(q12s16);
- d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
- d27s8 = vreinterpret_s8_u8(d27u8);
-
- d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
- d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
- d28s8 = vshr_n_s8(d28s8, 3);
- d27s8 = vshr_n_s8(d27s8, 3);
-
- d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
-
- d27s8 = vrshr_n_s8(d27s8, 1);
- d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
-
- d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
- d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
-
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
- *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
- *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
- *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
- return;
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}
-void vp9_lpf_horizontal_4_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
-
- if (count == 0) // end_vp9_lf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < count; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- vp9_loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- s -= (pitch * 5);
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- s += pitch;
- vst1_u8(s, d6u8);
- s += pitch;
- vst1_u8(s, d7u8);
- }
- return;
+#if HAVE_NEON_ASM
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
}
-void vp9_lpf_vertical_4_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i, pitch8;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
-
- if (count == 0) // end_vp9_lf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- pitch8 = pitch * 8;
- for (i = 0; i < count; i++, src += pitch8) {
- s = src - (i + 1) * 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- vp9_loop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d4u8, &d5u8, &d6u8, &d7u8);
-
- d4Result.val[0] = d4u8;
- d4Result.val[1] = d5u8;
- d4Result.val[2] = d6u8;
- d4Result.val[3] = d7u8;
-
- src -= 2;
- vst4_lane_u8(src, d4Result, 0);
- src += pitch;
- vst4_lane_u8(src, d4Result, 1);
- src += pitch;
- vst4_lane_u8(src, d4Result, 2);
- src += pitch;
- vst4_lane_u8(src, d4Result, 3);
- src += pitch;
- vst4_lane_u8(src, d4Result, 4);
- src += pitch;
- vst4_lane_u8(src, d4Result, 5);
- src += pitch;
- vst4_lane_u8(src, d4Result, 6);
- src += pitch;
- vst4_lane_u8(src, d4Result, 7);
- }
- return;
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}
-static INLINE void vp9_mbloop_filter_neon(
- uint8x8_t dblimit, // mblimit
- uint8x8_t dlimit, // limit
- uint8x8_t dthresh, // thresh
- uint8x8_t d3u8, // p2
- uint8x8_t d4u8, // p2
- uint8x8_t d5u8, // p1
- uint8x8_t d6u8, // p0
- uint8x8_t d7u8, // q0
- uint8x8_t d16u8, // q1
- uint8x8_t d17u8, // q2
- uint8x8_t d18u8, // q3
- uint8x8_t *d0ru8, // p1
- uint8x8_t *d1ru8, // p1
- uint8x8_t *d2ru8, // p0
- uint8x8_t *d3ru8, // q0
- uint8x8_t *d4ru8, // q1
- uint8x8_t *d5ru8) { // q1
- uint32_t flat;
- uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
- uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
- int16x8_t q15s16;
- uint16x8_t q10u16, q14u16;
- int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
-
- d19u8 = vabd_u8(d3u8, d4u8);
- d20u8 = vabd_u8(d4u8, d5u8);
- d21u8 = vabd_u8(d5u8, d6u8);
- d22u8 = vabd_u8(d16u8, d7u8);
- d23u8 = vabd_u8(d17u8, d16u8);
- d24u8 = vabd_u8(d18u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
- d20u8 = vmax_u8(d21u8, d22u8);
-
- d25u8 = vabd_u8(d6u8, d4u8);
-
- d23u8 = vmax_u8(d23u8, d24u8);
-
- d26u8 = vabd_u8(d7u8, d17u8);
-
- d19u8 = vmax_u8(d19u8, d20u8);
-
- d24u8 = vabd_u8(d6u8, d7u8);
- d27u8 = vabd_u8(d3u8, d6u8);
- d28u8 = vabd_u8(d18u8, d7u8);
-
- d19u8 = vmax_u8(d19u8, d23u8);
-
- d23u8 = vabd_u8(d5u8, d16u8);
- d24u8 = vqadd_u8(d24u8, d24u8);
-
-
- d19u8 = vcge_u8(dlimit, d19u8);
-
-
- d25u8 = vmax_u8(d25u8, d26u8);
- d26u8 = vmax_u8(d27u8, d28u8);
-
- d23u8 = vshr_n_u8(d23u8, 1);
-
- d25u8 = vmax_u8(d25u8, d26u8);
-
- d24u8 = vqadd_u8(d24u8, d23u8);
-
- d20u8 = vmax_u8(d20u8, d25u8);
-
- d23u8 = vdup_n_u8(1);
- d24u8 = vcge_u8(dblimit, d24u8);
-
- d21u8 = vcgt_u8(d21u8, dthresh);
-
- d20u8 = vcge_u8(d23u8, d20u8);
-
- d19u8 = vand_u8(d19u8, d24u8);
-
- d23u8 = vcgt_u8(d22u8, dthresh);
-
- d20u8 = vand_u8(d20u8, d19u8);
-
- d22u8 = vdup_n_u8(0x80);
-
- d23u8 = vorr_u8(d21u8, d23u8);
-
- q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
- vreinterpret_u16_u8(d21u8));
-
- d30u8 = vshrn_n_u16(q10u16, 4);
- flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
-
- if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
- d27u8 = vdup_n_u8(3);
- d21u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- *d0ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- *d1ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- *d2ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d3ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d4ru8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
- *d5ru8 = vqrshrn_n_u16(q14u16, 3);
- } else {
- d21u8 = veor_u8(d7u8, d22u8);
- d24u8 = veor_u8(d6u8, d22u8);
- d25u8 = veor_u8(d5u8, d22u8);
- d26u8 = veor_u8(d16u8, d22u8);
-
- d27u8 = vdup_n_u8(3);
-
- d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
- d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
-
- q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
-
- d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- q15s16 = vaddw_s8(q15s16, d29s8);
-
- d29u8 = vdup_n_u8(4);
-
- d28s8 = vqmovn_s16(q15s16);
-
- d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
-
- d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
- d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
- d30s8 = vshr_n_s8(d30s8, 3);
- d29s8 = vshr_n_s8(d29s8, 3);
-
- d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
- d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
-
- d29s8 = vrshr_n_s8(d29s8, 1);
- d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
-
- d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
- d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
-
- if (flat == 0) { // filter_branch_only
- *d0ru8 = d4u8;
- *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
- *d5ru8 = d17u8;
- return;
- }
-
- d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
- d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
- d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
- d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
-
- d23u8 = vdup_n_u8(2);
- q14u16 = vaddl_u8(d6u8, d7u8);
- q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
- q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
-
- d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
-
- q14u16 = vaddw_u8(q14u16, d5u8);
-
- d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
-
- d30u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vaddw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
-
- d31u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vaddw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
-
- *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
-
- d23u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d3u8);
- q14u16 = vsubw_u8(q14u16, d6u8);
- q14u16 = vaddw_u8(q14u16, d7u8);
-
- *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
-
- d22u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d4u8);
- q14u16 = vsubw_u8(q14u16, d7u8);
- q14u16 = vaddw_u8(q14u16, d16u8);
-
- d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
-
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
-
- d6u8 = vqrshrn_n_u16(q14u16, 3);
-
- q14u16 = vsubw_u8(q14u16, d5u8);
- q14u16 = vsubw_u8(q14u16, d16u8);
- q14u16 = vaddw_u8(q14u16, d17u8);
- q14u16 = vaddw_u8(q14u16, d18u8);
-
- d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
-
- d7u8 = vqrshrn_n_u16(q14u16, 3);
-
- *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
- *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
- *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
- }
- return;
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
}
-
-void vp9_lpf_horizontal_8_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s, *psrc;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
-
- if (count == 0) // end_vp9_mblf_h_edge
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- psrc = src - (pitch << 2);
- for (i = 0; i < count; i++) {
- s = psrc + i * 8;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- s -= (pitch * 6);
- vst1_u8(s, d0u8);
- s += pitch;
- vst1_u8(s, d1u8);
- s += pitch;
- vst1_u8(s, d2u8);
- s += pitch;
- vst1_u8(s, d3u8);
- s += pitch;
- vst1_u8(s, d4u8);
- s += pitch;
- vst1_u8(s, d5u8);
- }
- return;
-}
-
-void vp9_lpf_vertical_8_neon(
- unsigned char *src,
- int pitch,
- unsigned char *blimit,
- unsigned char *limit,
- unsigned char *thresh,
- int count) {
- int i;
- uint8_t *s;
- uint8x8_t dblimit, dlimit, dthresh;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint8x8_t d16u8, d17u8, d18u8;
- uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
- uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
- uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
- uint8x8x4_t d4Result;
- uint8x8x2_t d2Result;
-
- if (count == 0)
- return;
-
- dblimit = vld1_u8(blimit);
- dlimit = vld1_u8(limit);
- dthresh = vld1_u8(thresh);
-
- for (i = 0; i < count; i++) {
- s = src + (i * (pitch << 3)) - 4;
-
- d3u8 = vld1_u8(s);
- s += pitch;
- d4u8 = vld1_u8(s);
- s += pitch;
- d5u8 = vld1_u8(s);
- s += pitch;
- d6u8 = vld1_u8(s);
- s += pitch;
- d7u8 = vld1_u8(s);
- s += pitch;
- d16u8 = vld1_u8(s);
- s += pitch;
- d17u8 = vld1_u8(s);
- s += pitch;
- d18u8 = vld1_u8(s);
-
- d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
- vreinterpret_u32_u8(d7u8));
- d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
- vreinterpret_u32_u8(d16u8));
- d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
- vreinterpret_u32_u8(d17u8));
- d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
- vreinterpret_u32_u8(d18u8));
-
- d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
- vreinterpret_u16_u32(d2tmp2.val[0]));
- d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
- vreinterpret_u16_u32(d2tmp3.val[0]));
- d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
- vreinterpret_u16_u32(d2tmp2.val[1]));
- d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
- vreinterpret_u16_u32(d2tmp3.val[1]));
-
- d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
- vreinterpret_u8_u16(d2tmp5.val[0]));
- d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
- vreinterpret_u8_u16(d2tmp5.val[1]));
- d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
- vreinterpret_u8_u16(d2tmp7.val[0]));
- d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
- vreinterpret_u8_u16(d2tmp7.val[1]));
-
- d3u8 = d2tmp8.val[0];
- d4u8 = d2tmp8.val[1];
- d5u8 = d2tmp9.val[0];
- d6u8 = d2tmp9.val[1];
- d7u8 = d2tmp10.val[0];
- d16u8 = d2tmp10.val[1];
- d17u8 = d2tmp11.val[0];
- d18u8 = d2tmp11.val[1];
-
- vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
- d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
- &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
-
- d4Result.val[0] = d0u8;
- d4Result.val[1] = d1u8;
- d4Result.val[2] = d2u8;
- d4Result.val[3] = d3u8;
-
- d2Result.val[0] = d4u8;
- d2Result.val[1] = d5u8;
-
- s = src - 3;
- vst4_lane_u8(s, d4Result, 0);
- s += pitch;
- vst4_lane_u8(s, d4Result, 1);
- s += pitch;
- vst4_lane_u8(s, d4Result, 2);
- s += pitch;
- vst4_lane_u8(s, d4Result, 3);
- s += pitch;
- vst4_lane_u8(s, d4Result, 4);
- s += pitch;
- vst4_lane_u8(s, d4Result, 5);
- s += pitch;
- vst4_lane_u8(s, d4Result, 6);
- s += pitch;
- vst4_lane_u8(s, d4Result, 7);
-
- s = src + 1;
- vst2_lane_u8(s, d2Result, 0);
- s += pitch;
- vst2_lane_u8(s, d2Result, 1);
- s += pitch;
- vst2_lane_u8(s, d2Result, 2);
- s += pitch;
- vst2_lane_u8(s, d2Result, 3);
- s += pitch;
- vst2_lane_u8(s, d2Result, 4);
- s += pitch;
- vst2_lane_u8(s, d2Result, 5);
- s += pitch;
- vst2_lane_u8(s, d2Result, 6);
- s += pitch;
- vst2_lane_u8(s, d2Result, 7);
- }
- return;
-}
+#endif // HAVE_NEON_ASM
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 239c049..4eb2e64 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -81,6 +81,7 @@
const vp9_prob *prob;
int len;
int base_val;
+ const int16_t *cost;
} vp9_extra_bit;
// indexed by token value
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 1a3fefc..b48d522 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1517,12 +1517,12 @@
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
output[0] = WRAPLOW(step[0] + step[3], bd);
@@ -1562,10 +1562,11 @@
int dest_stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1587,12 +1588,12 @@
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2 & stage 3 - even half
vp9_highbd_idct4(step1, step1, bd);
@@ -1607,8 +1608,8 @@
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
// stage 4
@@ -1653,9 +1654,10 @@
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
@@ -1696,10 +1698,10 @@
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd);
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd);
- output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1764,14 +1766,14 @@
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
// stage 2
s0 = x0;
@@ -1787,10 +1789,10 @@
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1798,10 +1800,10 @@
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x4, bd);
@@ -1910,23 +1912,23 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 3
step1[0] = step2[0];
@@ -1936,12 +1938,12 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -1955,12 +1957,12 @@
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -1970,12 +1972,12 @@
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -1987,8 +1989,8 @@
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2013,12 +2015,12 @@
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2115,22 +2117,22 @@
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
// stage 2
s0 = x0;
@@ -2158,14 +2160,14 @@
x5 = WRAPLOW(s1 - s5, bd);
x6 = WRAPLOW(s2 - s6, bd);
x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
// stage 3
s0 = x0;
@@ -2189,18 +2191,18 @@
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
x8 = WRAPLOW(s8 + s10, bd);
x9 = WRAPLOW(s9 + s11, bd);
x10 = WRAPLOW(s8 - s10, bd);
x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
@@ -2212,14 +2214,14 @@
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11), bd);
- x14 = WRAPLOW(dct_const_round_shift(s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s15), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x8, bd);
@@ -2306,10 +2308,11 @@
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
@@ -2343,43 +2346,43 @@
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
step2[0] = step1[0];
@@ -2393,23 +2396,23 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[16] = WRAPLOW(step1[16] + step1[17], bd);
step2[17] = WRAPLOW(step1[16] - step1[17], bd);
@@ -2436,12 +2439,12 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -2456,22 +2459,22 @@
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2480,12 +2483,12 @@
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2495,12 +2498,12 @@
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2530,8 +2533,8 @@
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2547,20 +2550,20 @@
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2581,12 +2584,12 @@
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2632,20 +2635,20 @@
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2759,8 +2762,9 @@
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 1d8836c..6e2551d 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -80,13 +80,7 @@
static const tran_high_t sinpi_4_9 = 15212;
static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_VP9_HIGHBITDEPTH
- // For valid highbitdepth VP9 streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
-#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -95,7 +89,7 @@
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
@@ -104,6 +98,32 @@
return check_range(rv);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+ int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void) int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void) bd;
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+ int bd) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return highbd_check_range(rv, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 92650e9..e7ee903 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -136,13 +136,27 @@
}
}
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+ const int adj = qdiff >> MFQE_PRECISION;
+ if (bs == BLOCK_16X16) {
+ *sad_thr = 7 + adj;
+ } else if (bs == BLOCK_32X32) {
+ *sad_thr = 6 + adj;
+ } else { // BLOCK_64X64
+ *sad_thr = 5 + adj;
+ }
+ *vdiff_thr = 125 + qdiff;
+}
+
static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
const uint8_t *v, int y_stride, int uv_stride,
- uint8_t *yd, uint8_t *ud, uint8_t *vd,
- int yd_stride, int uvd_stride) {
- int sad, sad_thr, vdiff;
+ uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+ int uvd_stride, int qdiff) {
+ int sad, sad_thr, vdiff, vdiff_thr;
uint32_t sse;
+ get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
if (bs == BLOCK_16X16) {
vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vp9_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -154,23 +168,18 @@
sad = (vp9_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
- if (bs == BLOCK_16X16) {
- sad_thr = 8;
- } else if (bs == BLOCK_32X32) {
- sad_thr = 7;
- } else { // BLOCK_64X64
- sad_thr = 6;
- }
-
- // TODO(jackychen): More experiments and remove magic numbers.
// vdiff > sad * 3 means vdiff should not be too small, otherwise,
// it might be a lighting change in smooth area. When there is a
// lighting change in smooth area, it is dangerous to do MFQE.
- if (sad > 1 && sad < sad_thr && vdiff > sad * 3 && vdiff < 150) {
- // TODO(jackychen): Add weighted average in the calculation.
- // Currently, the data is copied from last frame without averaging.
- apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride,
- ud, vd, uvd_stride, bs, 0);
+ if (sad > 1 && vdiff > sad * 3) {
+ const int weight = 1 << MFQE_PRECISION;
+ int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+ // When ifactor equals weight, no MFQE is done.
+ if (ifactor > weight) {
+ ifactor = weight;
+ }
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+ uvd_stride, bs, ifactor);
} else {
// Copy the block from current frame (i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
@@ -199,8 +208,7 @@
int yd_stride, int uvd_stride) {
int mi_offset, y_offset, uv_offset;
const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
- // TODO(jackychen): Consider how and whether to use qdiff in MFQE.
- // int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
const int bsl = b_width_log2_lookup[bs];
PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
const BLOCK_SIZE subsize = get_subsize(bs, partition);
@@ -235,18 +243,18 @@
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
- vd + uv_offset, yd_stride, uvd_stride);
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
- vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
@@ -254,7 +262,7 @@
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
- yd_stride, uvd_stride);
+ yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_VERT:
@@ -268,18 +276,18 @@
if (mfqe_decision(mi, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
v + uv_offset * uv_stride, y_stride, uv_stride,
yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
- vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
}
if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
// Do mfqe on the first square partition.
mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
y_stride, uv_stride, yd + y_offset, ud + uv_offset,
- vd + uv_offset, yd_stride, uvd_stride);
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
// Do mfqe on the second square partition.
mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
u + uv_offset * uv_stride + uv_offset,
@@ -287,14 +295,14 @@
uv_stride, yd + y_offset * yd_stride + y_offset,
ud + uv_offset * uvd_stride + uv_offset,
vd + uv_offset * uvd_stride + uv_offset,
- yd_stride, uvd_stride);
+ yd_stride, uvd_stride, qdiff);
}
break;
case PARTITION_NONE:
if (mfqe_decision(mi, cur_bs)) {
// Do mfqe on this partition.
mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
- yd, ud, vd, yd_stride, uvd_stride);
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
} else {
// Copy the block from current frame(i.e., no mfqe is done).
copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d2ab875..88f85a8 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -224,10 +224,12 @@
$vp9_lpf_vertical_16_dual_neon_asm=vp9_lpf_vertical_16_dual_neon;
add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8 sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_neon_asm=vp9_lpf_vertical_8_neon;
add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
@@ -240,10 +242,12 @@
$vp9_lpf_horizontal_16_neon_asm=vp9_lpf_horizontal_16_neon;
add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8 sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_neon_asm=vp9_lpf_horizontal_8_neon;
add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/;
+specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
+$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 8c4a303..7a20e0a 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@
vp9_tile_set_col(tile, cm, col);
}
+static int get_min_log2_tile_cols(const int sb64_cols) {
+ int min_log2 = 0;
+ while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
+ ++min_log2;
+ return min_log2;
+}
+
+static int get_max_log2_tile_cols(const int sb64_cols) {
+ int max_log2 = 1;
+ while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+ ++max_log2;
+ return max_log2 - 1;
+}
+
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
- const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
- int min_log2 = 0, max_log2 = 0;
-
- // max
- while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
- ++max_log2;
- --max_log2;
- if (max_log2 < 0)
- max_log2 = 0;
-
- // min
- while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
- ++min_log2;
-
- assert(min_log2 <= max_log2);
-
- *min_log2_tile_cols = min_log2;
- *max_log2_tile_cols = max_log2;
+ const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+ assert(*min_log2_tile_cols <= *max_log2_tile_cols);
}
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9677173..3b8af84 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1546,8 +1546,6 @@
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- init_macroblockd(cm, &pbi->mb);
-
cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
cm->width == cm->last_width &&
cm->height == cm->last_height &&
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index bb7bb56..1d254d2 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -238,7 +238,7 @@
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
- cm->frame_refs[ref_index].idx = INT_MAX;
+ cm->frame_refs[ref_index].idx = -1;
}
int vp9_receive_compressed_data(VP9Decoder *pbi,
@@ -258,7 +258,7 @@
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX)
+ if (cm->frame_refs[0].idx > 0)
cm->frame_refs[0].buf->corrupted = 1;
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 8704fdd..23d622d 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -14,6 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
#include "vp9/decoder/vp9_detokenize.h"
@@ -32,7 +35,7 @@
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
- ++coef_counts[band][ctx][token]; \
+ ++coef_counts[band][ctx][token]; \
} while (0)
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@@ -191,10 +194,15 @@
}
v = (val * dqv) >> dq_shift;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+ cm->bit_depth);
+#else
dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#else
dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
ctx = get_coef_context(nb, token_cache, c);
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 752429c..19bcfd2 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -297,7 +297,6 @@
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
write_inter_mode(w, mode, inter_probs);
- ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
}
}
@@ -320,7 +319,6 @@
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
write_inter_mode(w, b_mode, inter_probs);
- ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
if (b_mode == NEWMV) {
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -1172,8 +1170,6 @@
prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
counts->inter_mode[i], INTER_MODES, &header_bc);
- vp9_zero(counts->inter_mode);
-
if (cm->interp_filter == SWITCHABLE)
update_switchable_interp_probs(cm, &header_bc, counts);
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 506f6de..41f72f8 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -170,7 +170,6 @@
vp9_fdct4x4_c(input, output, stride);
} else {
tran_low_t out[4 * 4];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
@@ -183,7 +182,7 @@
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- outptr[j * 4 + i] = temp_out[j];
+ out[j * 4 + i] = temp_out[j];
}
// Rows
@@ -711,7 +710,6 @@
vp9_fdct8x8_c(input, output, stride);
} else {
tran_low_t out[64];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
@@ -722,7 +720,7 @@
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- outptr[j * 8 + i] = temp_out[j];
+ out[j * 8 + i] = temp_out[j];
}
// Rows
@@ -1103,7 +1101,6 @@
vp9_fdct16x16_c(input, output, stride);
} else {
tran_low_t out[256];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
@@ -1114,7 +1111,7 @@
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 56ec6b3..5a99eca 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -49,9 +49,7 @@
}
static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) *
- (increase_denoising ? 60 : 40);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
}
static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
@@ -60,19 +58,16 @@
noise_motion_thresh(bs, increase_denoising)) {
return 0;
} else {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * 20;
+ return (1 << num_pels_log2_lookup[bs]) * 20;
}
}
int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
// TODO(jackychen): If increase_denoising is enabled in the future,
@@ -195,15 +190,6 @@
return framebuf + (stride * mi_row * 8) + (mi_col * 8);
}
-static void copy_block(uint8_t *dest, int dest_stride,
- const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
- int r;
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
- dest += dest_stride;
- src += src_stride;
- }
-}
static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
MACROBLOCK *mb,
@@ -348,9 +334,15 @@
}
if (decision == FILTER_BLOCK) {
- copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+ vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+ vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
}
}
@@ -368,6 +360,15 @@
}
}
+static void swap_frame_buffer(YV12_BUFFER_CONFIG dest,
+ YV12_BUFFER_CONFIG src) {
+ uint8_t *tmp_buf = dest.y_buffer;
+ assert(dest.y_width == src.y_width);
+ assert(dest.y_height == src.y_height);
+ dest.y_buffer = src.y_buffer;
+ src.y_buffer = tmp_buf;
+}
+
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
@@ -377,28 +378,32 @@
if (frame_type == KEY_FRAME) {
int i;
// Start at 1 so as not to overwrite the INTRA_FRAME
- for (i = 1; i < MAX_REF_FRAMES; ++i) {
+ for (i = 1; i < MAX_REF_FRAMES; ++i)
copy_frame(denoiser->running_avg_y[i], src);
- }
- } else { /* For non key frames */
- if (refresh_alt_ref_frame) {
- copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_golden_frame) {
- copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_last_frame) {
- copy_frame(denoiser->running_avg_y[LAST_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
+ return;
+ }
+
+ /* For non key frames */
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(denoiser->running_avg_y[ALTREF_FRAME],
+ denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(denoiser->running_avg_y[GOLDEN_FRAME],
+ denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(denoiser->running_avg_y[LAST_FRAME],
+ denoiser->running_avg_y[INTRA_FRAME]);
}
}
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
ctx->zeromv_sse = UINT_MAX;
- ctx->newmv_sse = UINT_MAX;
+ // This should be initialized as zero since mode search stage might skip
+ // NEWMV mode if inferred motion vector modes provide sufficiently good
+ // prediction quality.
+ ctx->newmv_sse = 0;
}
void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
@@ -458,12 +463,14 @@
make_grayscale(&denoiser->running_avg_y[i]);
#endif
denoiser->increase_denoising = 0;
+ denoiser->frame_buffer_initialized = 1;
return 0;
}
void vp9_denoiser_free(VP9_DENOISER *denoiser) {
int i;
+ denoiser->frame_buffer_initialized = 0;
if (denoiser == NULL) {
return;
}
@@ -483,15 +490,13 @@
uint8_t *u = yuv->u_buffer;
uint8_t *v = yuv->v_buffer;
- // The '/2's are there because we have a 440 buffer, but we want to output
- // 420.
- for (r = 0; r < yuv->uv_height / 2; ++r) {
- for (c = 0; c < yuv->uv_width / 2; ++c) {
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
u[c] = UINT8_MAX / 2;
v[c] = UINT8_MAX / 2;
}
- u += yuv->uv_stride + yuv->uv_width / 2;
- v += yuv->uv_stride + yuv->uv_width / 2;
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
}
}
#endif
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index 421dfcd..8eb5da1 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -29,6 +29,7 @@
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
int increase_denoising;
+ int frame_buffer_initialized;
} VP9_DENOISER;
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 4c94823..7560527 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -55,9 +55,6 @@
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
// This is used as a reference when computing the source variance for the
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
@@ -1010,22 +1007,20 @@
const MACROBLOCKD *const xd = &x->e_mbd;
const MODE_INFO *const mi = xd->mi[0].src_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
if (!seg_ref_active) {
- FRAME_COUNTS *const counts = td->counts;
- const int inter_block = is_inter_block(mbmi);
-
counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (inter_block) {
const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
if (cm->reference_mode == REFERENCE_MODE_SELECT)
counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
[has_second_ref(mbmi)]++;
@@ -1042,6 +1037,25 @@
}
}
}
+ if (inter_block &&
+ !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+ if (bsize >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+ }
+ }
+ }
+ }
}
}
@@ -1410,6 +1424,11 @@
const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
}
+
+ if (mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
}
if (cm->use_prev_frame_mvs) {
@@ -2705,9 +2724,12 @@
hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
- else
+ else if (bsize >= BLOCK_8X8)
vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
rd_cost, bsize, ctx);
+ else
+ vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col,
+ rd_cost, bsize, ctx);
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
@@ -3219,7 +3241,7 @@
pc_tree->vertical[0].skip = x->skip;
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
subsize, &pc_tree->vertical[0]);
- if (mi_col + hbs < cm->mi_cols) {
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
pc_tree->vertical[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
dummy_cost, subsize, &pc_tree->vertical[1]);
@@ -3240,7 +3262,7 @@
encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
subsize, &pc_tree->horizontal[0]);
- if (mi_row + hbs < cm->mi_rows) {
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
pc_tree->horizontal[1].pred_pixel_ready = 1;
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
dummy_cost, subsize, &pc_tree->horizontal[1]);
@@ -3312,9 +3334,10 @@
// Set the partition type of the 64X64 block
switch (sf->partition_search_type) {
case VAR_BASED_PARTITION:
- // TODO(jingning) Only key frame coding supports sub8x8 block at this
- // point. To be continued to enable sub8x8 block mode decision for
- // P frames.
+ // TODO(jingning, marpan): The mode decision and encoding process
+ // support both intra and inter sub8x8 block coding for RTC mode.
+ // Tune the thresholds accordingly to use sub8x8 block coding for
+ // coding performance improvement.
choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 9c29eb4..70b804e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -93,7 +93,7 @@
int rate;
int error;
int next;
- signed char token;
+ int16_t token;
short qc;
} vp9_token_state;
@@ -147,9 +147,15 @@
int next = eob, sz = 0;
int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
int64_t rd_cost0, rd_cost1;
- int rate0, rate1, error0, error1, t0, t1;
+ int rate0, rate1, error0, error1;
+ int16_t t0, t1;
+ EXTRABIT e0;
int best, band, pt, i, final_eob;
- const int16_t *dct_value_cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
assert((!type && !plane) || (type && plane));
assert(eob <= default_eob);
@@ -166,17 +172,6 @@
tokens[eob][0].qc = 0;
tokens[eob][1] = tokens[eob][0];
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->bd == 12) {
- dct_value_cost = vp9_dct_value_cost_high12_ptr;
- } else if (xd->bd == 10) {
- dct_value_cost = vp9_dct_value_cost_high10_ptr;
- } else {
- dct_value_cost = vp9_dct_value_cost_ptr;
- }
-#else
- dct_value_cost = vp9_dct_value_cost_ptr;
-#endif
for (i = 0; i < eob; i++)
token_cache[scan[i]] =
vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
@@ -193,7 +188,7 @@
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- t0 = vp9_get_token(x);
+ vp9_get_token_extra(x, &t0, &e0);
/* Consider both possible successor states. */
if (next < default_eob) {
band = band_translate[i + 1];
@@ -206,7 +201,7 @@
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
dx = mul * (dqcoeff[rc] - coeff[rc]);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -244,8 +239,10 @@
*/
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ e0 = 0;
} else {
- t0 = t1 = vp9_get_token(x);
+ vp9_get_token_extra(x, &t0, &e0);
+ t1 = t0;
}
if (next < default_eob) {
band = band_translate[i + 1];
@@ -264,7 +261,7 @@
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
if (shortcut) {
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 4fd34aa..49fe7de 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -186,7 +186,6 @@
if (!init_done) {
vp9_rtcd();
vp9_init_intra_predictors();
- vp9_tokenize_initialize();
vp9_init_me_luts();
vp9_rc_init_minq_luts();
vp9_entropy_mv_init();
@@ -1348,17 +1347,6 @@
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS);
- }
-#endif
}
#ifndef M_LOG2_E
@@ -1792,9 +1780,7 @@
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_free(&(cpi->denoiser));
- }
+ vp9_denoiser_free(&(cpi->denoiser));
#endif
for (t = 0; t < cpi->num_workers; ++t) {
@@ -2140,19 +2126,19 @@
} while (--h);
src = s->u_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
src = s->v_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
}
#endif
@@ -3409,6 +3395,20 @@
}
}
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS);
+ }
+}
+#endif
int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -3425,6 +3425,9 @@
check_initial_width(cpi, subsampling_x, subsampling_y);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
vpx_usec_timer_start(&timer);
if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags))
@@ -3999,6 +4002,10 @@
check_initial_width(cpi, 1, 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
if (width) {
cm->width = width;
if (cm->width > cpi->initial_width) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 319a478..5acfcc5 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -540,8 +540,7 @@
cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
- const int8_t segment_id = mbmi->segment_id;
- const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
INTERP_FILTER filter_ref;
const int bsl = mi_width_log2_lookup[bsize];
@@ -605,7 +604,10 @@
tx_mode_to_biggest_tx_size[cm->tx_mode]);
mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
EIGHTTAP : cm->interp_filter;
- mbmi->segment_id = segment_id;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ vp9_denoiser_reset_frame_stats(ctx);
+#endif
for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
@@ -662,6 +664,7 @@
clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
mbmi->ref_frame[0] = ref_frame;
+ set_ref_ptrs(cm, xd, ref_frame, NONE);
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
int rate_mv = 0;
@@ -946,3 +949,247 @@
*rd_cost = best_rdc;
}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+ MV_REFERENCE_FRAME best_ref_frame = NONE;
+ unsigned char segment_id = mbmi->segment_id;
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int64_t best_rd = INT64_MAX;
+ b_mode_info bsi[MAX_REF_FRAMES][4];
+ int ref_frame_skip_mask = 0;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ ctx->pred_pixel_ready = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ int_mv dummy_mv[2];
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+ const struct scale_factors *const sf =
+ &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+ sf, sf);
+ vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0].src_mi, ref_frame,
+ candidates, mi_row, mi_col);
+
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &dummy_mv[0], &dummy_mv[1]);
+ } else {
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ }
+
+ mbmi->sb_type = bsize;
+ mbmi->tx_size = TX_4X4;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+ : cm->interp_filter;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ int64_t this_rd = 0;
+ int plane;
+
+ if (ref_frame_skip_mask & (1 << ref_frame))
+ continue;
+
+ // TODO(jingning, agrange): Scaling reference frame not supported for
+ // sub8x8 blocks. Is this supported now?
+ if (ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
+ mbmi->ref_frame[0] = ref_frame;
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (plane = 0; plane < MAX_MB_PLANE; plane++)
+ xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int_mv b_mv[MB_MODE_COUNT];
+ int64_t b_best_rd = INT64_MAX;
+ const int i = idy * 2 + idx;
+ PREDICTION_MODE this_mode;
+ int b_rate = 0;
+ int64_t b_dist = 0;
+ RD_COST this_rdc;
+ unsigned int var_y, sse_y;
+
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+
+ const struct buf_2d orig_src = p->src;
+ const struct buf_2d orig_dst = pd->dst;
+ struct buf_2d orig_pre[2];
+ vpx_memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+ // set buffer pointers for sub8x8 motion search.
+ p->src.buf =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ pd->dst.buf =
+ &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ pd->pre[0].buf =
+ &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+ i, pd->pre[0].stride)];
+
+ b_mv[ZEROMV].as_int = 0;
+ b_mv[NEWMV].as_int = INVALID_MV;
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
+ &b_mv[NEARESTMV],
+ &b_mv[NEARMV]);
+
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ xd->mi[0].bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+ if (this_mode == NEWMV) {
+ const int step_param = cpi->sf.mv.fullpel_search_step_param;
+ MV mvp_full;
+ MV tmp_mv;
+ int cost_list[5];
+ const int tmp_col_min = x->mv_col_min;
+ const int tmp_col_max = x->mv_col_max;
+ const int tmp_row_min = x->mv_row_min;
+ const int tmp_row_max = x->mv_row_max;
+ int dummy_dist;
+
+ if (i == 0) {
+ mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+ mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+ } else {
+ mvp_full.row = xd->mi[0].bmi[0].as_mv[0].as_mv.row >> 3;
+ mvp_full.col = xd->mi[0].bmi[0].as_mv[0].as_mv.col >> 3;
+ }
+
+ vp9_set_mv_search_range(x, &mbmi->ref_mvs[0]->as_mv);
+
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+ cond_cost_list(cpi, cost_list),
+ &mbmi->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+ INT_MAX, 0);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ // calculate the bit cost on motion vector
+ mvp_full.row = tmp_mv.row * 8;
+ mvp_full.col = tmp_mv.col * 8;
+
+ b_rate += vp9_mv_bit_cost(&mvp_full,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+
+ b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(NEWMV)];
+ if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+ continue;
+
+ cpi->find_fractional_mv_step(x, &tmp_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost,
+ &dummy_dist,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+
+ xd->mi[0].bmi[i].as_mv[0].as_mv = tmp_mv;
+ }
+
+ vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+ pd->dst.buf, pd->dst.stride,
+ &xd->mi[0].bmi[i].as_mv[0].as_mv,
+ &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide,
+ 4 * num_4x4_blocks_high, 0,
+ vp9_get_interp_kernel(mbmi->interp_filter),
+ MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1));
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
+
+ this_rdc.rate += b_rate;
+ this_rdc.dist += b_dist;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < b_best_rd) {
+ b_best_rd = this_rdc.rdcost;
+ bsi[ref_frame][i].as_mode = this_mode;
+ bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0].bmi[i].as_mv[0].as_mv;
+ }
+ } // mode search
+
+ // restore source and prediction buffer pointers.
+ p->src = orig_src;
+ pd->pre[0] = orig_pre[0];
+ pd->dst = orig_dst;
+ this_rd += b_best_rd;
+
+ xd->mi[0].bmi[i] = bsi[ref_frame][i];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0].bmi[i + 1] = xd->mi[0].bmi[i];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0].bmi[i + 2] = xd->mi[0].bmi[i];
+ }
+ } // loop through sub8x8 blocks
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_ref_frame = ref_frame;
+ }
+ } // reference frames
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->ref_frame[0] = best_ref_frame;
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int block = idy * 2 + idx;
+ xd->mi[0].bmi[block] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0].bmi[block + 1] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0].bmi[block + 2] = bsi[best_ref_frame][block];
+ }
+ }
+ mbmi->mode = xd->mi[0].bmi[3].as_mode;
+ ctx->mic = *(xd->mi[0].src_mi);
+ ctx->skip_txfm[0] = 0;
+ ctx->skip = 0;
+ // Dummy assignment for speed -5. No effect in speed -6.
+ rd_cost->rdcost = best_rd;
+}
diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h
index 57a1cf9..11f4409 100644
--- a/vp9/encoder/vp9_pickmode.h
+++ b/vp9/encoder/vp9_pickmode.h
@@ -26,6 +26,12 @@
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 34d49f0..375407d 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -516,6 +516,20 @@
}
}
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride) {
+ const int bw = b_width_log2_lookup[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base) {
+ const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
int ref_frame) {
const VP9_COMMON *const cm = &cpi->common;
diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h
index e1593af..59a87cf 100644
--- a/vp9/encoder/vp9_rd.h
+++ b/vp9/encoder/vp9_rd.h
@@ -141,6 +141,12 @@
int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
const MACROBLOCKD *const xd);
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride);
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base);
+
const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
int ref_frame);
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index ded082f..a183fdc 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -129,19 +129,6 @@
{{INTRA_FRAME, NONE}},
};
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
- int raster_block, int stride) {
- const int bw = b_width_log2_lookup[plane_bsize];
- const int y = 4 * (raster_block >> bw);
- const int x = 4 * (raster_block & ((1 << bw) - 1));
- return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
- int raster_block, int16_t *base) {
- const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int m, int n, int min_plane, int max_plane) {
int i;
@@ -360,6 +347,12 @@
uint8_t token_cache[32 * 32];
int pt = combine_entropy_contexts(*A, *L);
int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
// Check for consistency of tx_size with mode info
assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi, pd) == tx_size);
@@ -373,23 +366,29 @@
// dc token
int v = qcoeff[0];
- int prev_t = vp9_get_token(v);
- cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+ int16_t prev_t;
+ EXTRABIT e;
+ vp9_get_token_extra(v, &prev_t, &e);
+ cost = (*token_costs)[0][pt][prev_t] +
+ vp9_get_cost(prev_t, e, cat6_high_cost);
+
token_cache[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
for (c = 1; c < eob; c++) {
const int rc = scan[c];
- int t;
+ int16_t t;
v = qcoeff[rc];
- t = vp9_get_token(v);
+ vp9_get_token_extra(v, &t, &e);
if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][!prev_t][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
} else {
pt = get_coef_context(nb, token_cache, c);
- cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][pt][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
token_cache[rc] = vp9_pt_energy_class[t];
}
prev_t = t;
@@ -761,10 +760,10 @@
struct macroblockd_plane *pd = &xd->plane[0];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
- src_stride)];
- uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
- dst_stride)];
+ const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ src_stride)];
+ uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ dst_stride)];
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
@@ -808,8 +807,9 @@
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+ block,
+ p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0].src_mi->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
@@ -908,8 +908,8 @@
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff =
+ vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0].src_mi->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
@@ -1341,10 +1341,10 @@
const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
int idx, idy;
- const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
- p->src.stride)];
- uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
- pd->dst.stride)];
+ const uint8_t *const src =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0, ref;
const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1352,7 +1352,7 @@
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+ const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
pd->pre[ref].stride)];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1386,17 +1386,17 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
} else {
vp9_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
}
#else
vp9_subtract_block(height, width,
- raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
#endif // CONFIG_VP9_HIGHBITDEPTH
k = i;
@@ -1407,7 +1407,7 @@
k += (idy * 2 + idx);
coeff = BLOCK_OFFSET(p->coeff, k);
- x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1480,13 +1480,14 @@
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
- p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ p->src.stride)];
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
- pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[0].stride)];
+ pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[0].stride)];
if (has_second_ref(mbmi))
- pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[1].stride)];
+ pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[1].stride)];
}
static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 54b2594..4c89953 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -23,19 +23,6 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_tokenize.h"
-static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr =
- dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
-static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr =
- dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
-#endif
-
static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
{9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
{9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
@@ -98,6 +85,298 @@
static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
+static const int16_t zero_cost[] = {0};
+static const int16_t one_cost[] = {255, 257};
+static const int16_t two_cost[] = {255, 257};
+static const int16_t three_cost[] = {255, 257};
+static const int16_t four_cost[] = {255, 257};
+static const int16_t cat1_cost[] = {429, 431, 616, 618};
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {
+ 820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+ 1289, 1291
+};
+static const int16_t cat4_cost[] = {
+ 1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+ 1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+ 1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {
+ 1269, 1271, 1283, 1285, 1306, 1308, 1320,
+ 1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+ 1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+ 1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+ 1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+ 1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp9_cat6_low_cost[256] = {
+ 1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+ 1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+ 1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+ 1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+ 1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+ 1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+ 1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+ 2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+ 2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+ 2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+ 2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+ 2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+ 2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+ 2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+ 2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+ 2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+ 2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+ 2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+ 2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+ 2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+ 2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+ 2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp9_cat6_high_cost[128] = {
+ 72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+ 1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+ 5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+ 7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+ 7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const int16_t vp9_cat6_high10_high_cost[512] = {
+ 74, 894, 1185, 2005, 1450, 2270, 2561,
+ 3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+ 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+ 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+ 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+ 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+ 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+ 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+ 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+ 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+ 12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+ 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+ 9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+ 11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+ 13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp9_cat6_high12_high_cost[2048] = {
+ 76, 896, 1187, 2007, 1452, 2272, 2563,
+ 3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+ 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+ 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+ 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+ 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+ 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+ 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+ 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+ 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+ 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+ 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+ 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+ 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+ 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+ 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+ 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+ 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+ 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+ 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+ 16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+ 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+ 16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
+#endif
+
#if CONFIG_VP9_HIGHBITDEPTH
static const vp9_tree_index cat1_high10[2] = {0, 0};
static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
@@ -117,61 +396,49 @@
30, 30, 32, 32, 34, 34, 0, 0};
#endif
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
-
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
- }
-
- p[0] = p[1] = 0;
-}
-
-
const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO_TOKEN
+ {0, 0, 0, 1, one_cost}, // ONE_TOKEN
+ {0, 0, 0, 2, two_cost}, // TWO_TOKEN
+ {0, 0, 0, 3, three_cost}, // THREE_TOKEN
+ {0, 0, 0, 4, four_cost}, // FOUR_TOKEN
+ {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost}, // CATEGORY1_TOKEN
+ {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost}, // CATEGORY2_TOKEN
+ {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost}, // CATEGORY3_TOKEN
+ {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost}, // CATEGORY4_TOKEN
+ {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost}, // CATEGORY5_TOKEN
+ {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0}, // CATEGORY6_TOKEN
+ {0, 0, 0, 0, zero_cost} // EOB_TOKEN
};
#if CONFIG_VP9_HIGHBITDEPTH
const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
#endif
@@ -180,46 +447,6 @@
{125, 7}, {126, 7}, {127, 7}, {0, 1}
};
-static void tokenize_init_one(const vp9_extra_bit *const e,
- int16_t *value_cost, int max_value) {
- int i = -max_value;
-
- TOKENVALUE t;
- do {
-
- vp9_get_token_extra(i, &t.token, &t.extra);
- // initialize the cost for extra bits for all possible coefficient value.
- {
- int cost = 0;
- const vp9_extra_bit *p = &e[t.token];
-
- if (p->base_val) {
- const int extra = t.extra;
- const int length = p->len;
-
- if (length)
- cost += treed_cost(p->tree, p->prob, extra >> 1, length);
-
- cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
- value_cost[i] = cost;
- }
- }
- } while (++i < max_value);
-}
-
-void vp9_tokenize_initialize() {
- tokenize_init_one(vp9_extra_bits,
- dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
-#if CONFIG_VP9_HIGHBITDEPTH
- tokenize_init_one(vp9_extra_bits_high10,
- dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
- DCT_MAX_VALUE_HIGH10);
-
- tokenize_init_one(vp9_extra_bits_high12,
- dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
- DCT_MAX_VALUE_HIGH12);
-#endif
-}
struct tokenize_b_args {
VP9_COMP *cpi;
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index 49dc4a7..81cc2e1 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -20,8 +20,6 @@
extern "C" {
#endif
-void vp9_tokenize_initialize();
-
#define EOSB_TOKEN 127 // Not signalled, encoder only
#if CONFIG_VP9_HIGHBITDEPTH
@@ -63,11 +61,29 @@
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int16_t vp9_cat6_high_cost[128];
+extern const int16_t vp9_cat6_high10_high_cost[512];
+extern const int16_t vp9_cat6_high12_high_cost[2048];
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
+ const int16_t *cat6_high_table) {
+ if (token != CATEGORY6_TOKEN)
+ return vp9_extra_bits[token].cost[extrabits];
+ return vp9_cat6_low_cost[extrabits & 0xff]
+ + cat6_high_table[extrabits >> 8];
+}
+
#if CONFIG_VP9_HIGHBITDEPTH
-extern const int16_t *vp9_dct_value_cost_high10_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-extern const int16_t *vp9_dct_value_cost_high12_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ return bit_depth == 8 ? vp9_cat6_high_cost
+ : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
+ vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ (void) bit_depth;
+ return vp9_cat6_high_cost;
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f5e6e31..76602c2 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -133,11 +133,14 @@
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
@@ -156,9 +159,7 @@
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
@@ -176,8 +177,11 @@
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
+# TODO(johannkoenig): re-enable when chromium build is fixed
+# # https://code.google.com/p/chromium/issues/detail?id=443839
+#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 43bf35f..b917787 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -148,7 +148,11 @@
if (frame_marker != VP9_FRAME_MARKER)
return VPX_CODEC_UNSUP_BITSTREAM;
- if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+ if (profile >= MAX_PROFILES)
+ return VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+ return VPX_CODEC_UNSUP_BITSTREAM;
if (vp9_rb_read_bit(&rb)) { // show an existing frame
vp9_rb_read_literal(&rb, 3); // Frame buffer to show.