Merge "vp9_filter: make all filter tables static"
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index c22a04a..5a60976 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -630,7 +630,8 @@
if (svc_ctx.speed != -1)
vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed);
- vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 0);
+ if (svc_ctx.threads)
+ vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
// Encode frames
while (!end_of_stream) {
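The new control call above derives the tile-columns setting from the configured thread count. VP9E_SET_TILE_COLUMNS expects log2 of the number of tile columns, so the mapping the example now uses can be read as the following small sketch (illustrative only; the helper name is hypothetical and not part of the patch):

static int svc_tile_columns_log2(int threads) {
  /* 2 threads -> 1 (two tile columns), 4 threads -> 2 (four columns);
   * mirrors the (svc_ctx.threads >> 1) argument passed above. */
  return threads >> 1;
}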
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index da2a7cf..484deb5 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -608,7 +608,6 @@
// Real time parameters.
cfg.rc_dropframe_thresh = strtol(argv[9], NULL, 0);
cfg.rc_end_usage = VPX_CBR;
- cfg.rc_resize_allowed = 0;
cfg.rc_min_quantizer = 2;
cfg.rc_max_quantizer = 56;
if (strncmp(encoder->name, "vp9", 3) == 0)
@@ -619,6 +618,9 @@
cfg.rc_buf_optimal_sz = 600;
cfg.rc_buf_sz = 1000;
+ // Disable dynamic resizing by default.
+ cfg.rc_resize_allowed = 0;
+
// Use 1 thread as default.
cfg.g_threads = 1;
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index b37d8e3..66ca4bb 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -933,14 +933,15 @@
INSTANTIATE_TEST_CASE_P(
MSA, Trans16x16DCT,
::testing::Values(
- make_tuple(&vp9_fdct16x16_c,
+ make_tuple(&vp9_fdct16x16_msa,
&vp9_idct16x16_256_add_msa, 0, VPX_BITS_8)));
INSTANTIATE_TEST_CASE_P(
MSA, Trans16x16HT,
::testing::Values(
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
- make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8)));
+ make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 0, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 1, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
+ make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
+ VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 267dfb8..25059a5 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -386,7 +386,9 @@
INSTANTIATE_TEST_CASE_P(
MSA, Trans32x32Test,
::testing::Values(
- make_tuple(&vp9_fdct32x32_c,
- &vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8)));
+ make_tuple(&vp9_fdct32x32_msa,
+ &vp9_idct32x32_1024_add_msa, 0, VPX_BITS_8),
+ make_tuple(&vp9_fdct32x32_rd_msa,
+ &vp9_idct32x32_1024_add_msa, 1, VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index c61ffff..352cde2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -16,6 +16,7 @@
#include "./vp9_rtcd.h"
#include "test/acm_random.h"
+#include "test/clear_system_state.h"
#include "test/md5_helper.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
@@ -66,6 +67,7 @@
for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
pred_funcs[k](src, kBPS, above, left);
}
+ libvpx_test::ClearSystemState();
vpx_usec_timer_mark(&timer);
const int elapsed_time =
static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
@@ -206,9 +208,12 @@
#endif // HAVE_DSPR2
#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred4, NULL, NULL, NULL, NULL,
- vp9_v_predictor_4x4_neon, vp9_h_predictor_4x4_neon, NULL, NULL,
- NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred4, vp9_dc_predictor_4x4_neon,
+ vp9_dc_left_predictor_4x4_neon, vp9_dc_top_predictor_4x4_neon,
+ vp9_dc_128_predictor_4x4_neon, vp9_v_predictor_4x4_neon,
+ vp9_h_predictor_4x4_neon, vp9_d45_predictor_4x4_neon,
+ vp9_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+ vp9_tm_predictor_4x4_neon)
#endif // HAVE_NEON
#if HAVE_MSA
@@ -354,14 +359,18 @@
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL,
vp9_h_predictor_32x32_ssse3, vp9_d45_predictor_32x32_ssse3,
- NULL, NULL, NULL, vp9_d207_predictor_32x32_ssse3,
- vp9_d63_predictor_32x32_ssse3, NULL)
+ NULL, NULL, vp9_d153_predictor_32x32_ssse3,
+ vp9_d207_predictor_32x32_ssse3, vp9_d63_predictor_32x32_ssse3,
+ NULL)
#endif // HAVE_SSSE3
#if HAVE_NEON
-INTRA_PRED_TEST(NEON, TestIntraPred32, NULL, NULL, NULL, NULL,
- vp9_v_predictor_32x32_neon, vp9_h_predictor_32x32_neon, NULL,
- NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon)
+INTRA_PRED_TEST(NEON, TestIntraPred32, vp9_dc_predictor_32x32_neon,
+ vp9_dc_left_predictor_32x32_neon,
+ vp9_dc_top_predictor_32x32_neon,
+ vp9_dc_128_predictor_32x32_neon, vp9_v_predictor_32x32_neon,
+ vp9_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
+ vp9_tm_predictor_32x32_neon)
#endif // HAVE_NEON
#if HAVE_MSA
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 2d17119..670fe09 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -1106,12 +1106,12 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if CONFIG_VP9_ENCODER
typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
@@ -1160,7 +1160,7 @@
make_tuple(6, 5, subpel_variance64x32_c, 0),
make_tuple(6, 6, subpel_variance64x64_c, 0)));
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
vp8_sub_pixel_variance16x16_c;
const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
@@ -1174,7 +1174,7 @@
make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
vp9_sub_pixel_avg_variance4x4_c;
@@ -1460,7 +1460,7 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
#if HAVE_MMX
const SubpixVarMxNFunc subpel_variance16x16_mmx =
vp8_sub_pixel_variance16x16_mmx;
@@ -1476,7 +1476,7 @@
make_tuple(3, 3, subpel_variance8x8_mmx, 0),
make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
#endif // HAVE_MMX
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if CONFIG_VP9_ENCODER
#if HAVE_SSE2
@@ -1768,7 +1768,7 @@
#endif // HAVE_SSE2
#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
#if HAVE_SSE2
const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
vp8_sub_pixel_variance16x16_wmt;
@@ -1788,7 +1788,7 @@
make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
#endif // HAVE_SSE2
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if CONFIG_VP9_ENCODER
#if HAVE_SSSE3
@@ -1879,7 +1879,7 @@
#endif // HAVE_SSSE3
#endif // CONFIG_VP9_ENCODER
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
#if HAVE_SSSE3
const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
vp8_sub_pixel_variance16x16_ssse3;
@@ -1890,7 +1890,7 @@
::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
#endif // HAVE_SSSE3
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if HAVE_AVX2
const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
@@ -1931,7 +1931,7 @@
#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
#if HAVE_MEDIA
const SubpixVarMxNFunc subpel_variance16x16_media =
vp8_sub_pixel_variance16x16_armv6;
@@ -1942,7 +1942,7 @@
::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
make_tuple(4, 4, subpel_variance16x16_media, 0)));
#endif // HAVE_MEDIA
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if HAVE_NEON
const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,7 +1972,7 @@
make_tuple(3, 4, variance8x16_neon, 0),
make_tuple(3, 3, variance8x8_neon, 0)));
-#if CONFIG_VP8
+#if CONFIG_VP8_ENCODER
#if HAVE_NEON_ASM
const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
vp8_sub_pixel_variance16x16_neon;
@@ -1980,7 +1980,7 @@
NEON, VP8SubpelVarianceTest,
::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
#endif // HAVE_NEON_ASM
-#endif // CONFIG_VP8
+#endif // CONFIG_VP8_ENCODER
#if CONFIG_VP9_ENCODER
const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index 499c42a..13c46a5 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -15,6 +15,75 @@
#include "vpx/vpx_integer.h"
//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_top = vcombine_u16(p1, p1);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_left = vcombine_u16(p1, p1);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 3);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 2);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 2);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+ }
+ }
+}
+
+void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
// DC 8x8
// 'do_above' and 'do_left' facilitate branch removal when inlined.
@@ -161,6 +230,144 @@
dc_16x16(dst, stride, NULL, NULL, 0, 0);
}
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t A1 = vld1q_u8(above + 16);
+ const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
+ const uint16x8_t p1 = vpaddlq_u8(A1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_top = vcombine_u16(p5, p5);
+ }
+
+ if (do_left) {
+ const uint8x16_t L0 = vld1q_u8(left); // left column
+ const uint8x16_t L1 = vld1q_u8(left + 16);
+ const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
+ const uint16x8_t p1 = vpaddlq_u8(L1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_left = vcombine_u16(p5, p5);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 6);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 5);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 5);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 32; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+ }
+}
+
+void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row
+ const uint64x1_t A1 = vshr_n_u64(A0, 8);
+ const uint64x1_t A2 = vshr_n_u64(A0, 16);
+ const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+ const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+ const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ (void)left;
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+ dst[3 * stride + 3] = above[7];
+}
+
+// -----------------------------------------------------------------------------
+
+void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+ const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+ const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
+
#if !HAVE_NEON_ASM
void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
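For readers cross-checking the NEON DC predictors added above, here is a scalar sketch of the arithmetic they implement; the rounding matches vrshrn_n_u16 (add half the divisor, then shift). This is an illustration rather than code from the tree; bs is 4 or 32 for the blocks handled in this file.

#include <stddef.h>
#include <stdint.h>

static void dc_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left,
                             int bs, int do_above, int do_left) {
  int i, j, sum = 0, count = 0, dc = 0x80;  /* 0x80 when no edge is available */
  if (do_above) {
    for (i = 0; i < bs; ++i) sum += above[i];
    count += bs;
  }
  if (do_left) {
    for (i = 0; i < bs; ++i) sum += left[i];
    count += bs;
  }
  if (count) dc = (sum + (count >> 1)) / count;  /* rounded average */
  for (j = 0; j < bs; ++j) {
    for (i = 0; i < bs; ++i) dst[j * stride + i] = (uint8_t)dc;
  }
}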
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index f1217d5a..863b2dc 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -244,6 +244,22 @@
out3 = LW((psrc) + 3 * stride); \
}
+/* Description : Load double words with stride
+ Arguments : Inputs - psrc (source pointer to load from)
+ - stride
+ Outputs - out0, out1
+ Details : Loads double word in 'out0' from (psrc)
+ Loads double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+}
+
/* Description : Store 4 words with stride
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Details : Stores word from 'in0' to (pdst)
@@ -364,6 +380,17 @@
out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
}
+/* Description : Load 2 vectors of signed word elements with stride
+ Arguments : Inputs - psrc (source pointer to load from)
+ - stride
+ Outputs - out0, out1
+ Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+}
+
/* Description : Store vectors of 16 byte elements with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
@@ -482,6 +509,24 @@
SD(out0_m, pdst); \
}
+/* Description : Store as 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from input vector 'in' is copied
+ and stored to destination memory at (pdst)
+ Index 1 double word element from input vector 'in' is copied
+ and stored to destination memory at (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+}
+
/* Description : Store as 8x4 byte block to destination memory from input
vectors
Arguments : Inputs - in0, in1, pdst, stride
@@ -675,6 +720,24 @@
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1
+ cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - signed word
+ Details : Signed word elements from mult0 are multiplied with
+ signed word elements from cnst0 producing a result
+ twice the size of input i.e. signed double word.
+ The multiplication results of adjacent odd-even elements
+ are then added together and stored in the 'out' vector
+ (2 signed double word results)
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
/* Description : Dot product & addition of byte vector elements
Arguments : Inputs - mult0, mult1
cnst0, cnst1
@@ -743,6 +806,24 @@
CLIP_SH2_0_255(in2, in3); \
}
+/* Description : Addition of 4 signed word elements
+ 4 signed word elements of input vector are added together and
+ the resulting integer sum is returned
+ Arguments : Inputs - in (signed word vector)
+ Outputs - sum_m (i32 sum)
+ Return Type - signed word
+*/
+#define HADD_SW_S32(in) ({ \
+ v2i64 res0_m, res1_m; \
+ int32_t sum_m; \
+ \
+ res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ res1_m = __msa_splati_d(res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
+ sum_m; \
+})
+
/* Description : Horizontal addition of unsigned byte vector elements
Arguments : Inputs - in0, in1
Outputs - out0, out1
@@ -1039,8 +1120,8 @@
Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned halfword
Details : Each unsigned halfword element from 'in0' is saturated to the
- value generated with (sat_val+1) bit range
- Results are in placed to original vectors
+ value generated with (sat_val+1) bit range.
+ The results are stored in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) { \
in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
@@ -1062,7 +1143,7 @@
Return Type - unsigned halfword
Details : Each unsigned halfword element from 'in0' is saturated to the
value generated with (sat_val+1) bit range
- Results are in placed to original vectors
+ The results are stored in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) { \
in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
@@ -1182,10 +1263,10 @@
Outputs - in0, in1 (in-place)
Return Type - as per RTYPE
Details : Each unsigned byte element from input vector 'in0' is
- logically xor'ed with 128 and result is in-place stored in
+ logically xor'ed with 128 and the result is in-place stored in
'in0' vector
Each unsigned byte element from input vector 'in1' is
- logically xor'ed with 128 and result is in-place stored in
+ logically xor'ed with 128 and the result is in-place stored in
'in1' vector
Similar for other pairs
*/
@@ -1237,13 +1318,28 @@
}
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in0, in1, in2, in3 (in place)
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is in place written to 'in0'
+ Similar for other pairs
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+}
+
/* Description : Arithmetic shift right all elements of vector
(generic for all data types)
Arguments : Inputs - in0, in1, in2, in3, shift
Outputs - in0, in1, in2, in3 (in place)
Return Type - as per input vector RTYPE
Details : Each element of vector 'in0' is right shifted by 'shift' and
- result is in place written to 'in0'
+ the result is in place written to 'in0'
Here, 'shift' is GP variable passed in
Similar for other pairs
*/
@@ -1362,6 +1458,24 @@
ILVRL_B2_SH(zero_m, in, out0, out1); \
}
+/* Description : Sign extend halfword elements from input vector and return
+ result in pair of vectors
+ Arguments : Inputs - in (1 input halfword vector)
+ Outputs - out0, out1 (sign extended 2 word vectors)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved right with the same vector 'in' to
+ generate 4 signed word elements in 'out0'
+ Then interleaved left with the same vector 'in' to
+ generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+}
+
/* Description : Butterfly of 4 input vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
@@ -1393,6 +1507,34 @@
out7 = in0 - in7; \
}
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, out15) { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+}
+
/* Description : Transposes input 8x8 byte block
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
(input 8x8 byte block)
@@ -1606,7 +1748,7 @@
Outputs - out_m
Return Type - unsigned byte
Details : Signed byte even elements from 'in0' and 'in1' are packed
- together in one vector and the resulted vector is xor'ed with
+ together in one vector and the resulting vector is xor'ed with
128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB(in0, in1) ({ \
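Among the macros added above, UNPCK_SH_SW is the least self-explanatory: the mask produced by __msa_clti_s_h(in, 0) is all ones for negative lanes, and after the interleave it becomes the upper halfword of each 32-bit lane, which is exactly sign extension. A scalar sketch of the result, assuming the little-endian lane order used by these intrinsics (illustration only, not code from the tree):

#include <stdint.h>

static void unpck_sh_sw_ref(const int16_t in[8], int32_t out0[4],
                            int32_t out1[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    out0[i] = in[i];      /* right (low) half of 'in', sign extended */
    out1[i] = in[i + 4];  /* left (high) half of 'in', sign extended */
  }
}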
diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c
new file mode 100644
index 0000000..64cb9a8
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ int32_t src_weight) {
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int32_t row;
+ uint64_t src0_d, src1_d, dst0_d, dst1_d;
+ v16i8 src0 = { 0 };
+ v16i8 src1 = { 0 };
+ v16i8 dst0 = { 0 };
+ v16i8 dst1 = { 0 };
+ v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+ src_wt = __msa_fill_h(src_weight);
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 2; row--;) {
+ LD2(src_ptr, src_stride, src0_d, src1_d);
+ src_ptr += (2 * src_stride);
+ LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
+ INSERT_D2_SB(src0_d, src1_d, src0);
+ INSERT_D2_SB(dst0_d, dst1_d, dst0);
+
+ LD2(src_ptr, src_stride, src0_d, src1_d);
+ src_ptr += (2 * src_stride);
+ LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
+ INSERT_D2_SB(src0_d, src1_d, src1);
+ INSERT_D2_SB(dst0_d, dst1_d, dst1);
+
+ UNPCK_UB_SH(src0, src_r, src_l);
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+ ST8x2_UB(dst0, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+
+ UNPCK_UB_SH(src1, src_r, src_l);
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
+ ST8x2_UB(dst1, dst_ptr, dst_stride);
+ dst_ptr += (2 * dst_stride);
+ }
+}
+
+static void filter_by_weight16x16_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ uint8_t *dst_ptr,
+ int32_t dst_stride,
+ int32_t src_weight) {
+ int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int32_t row;
+ v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
+
+ src_wt = __msa_fill_h(src_weight);
+ dst_wt = __msa_fill_h(dst_weight);
+
+ for (row = 4; row--;) {
+ LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
+
+ UNPCK_UB_SH(src0, src_r, src_l);
+ UNPCK_UB_SH(dst0, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src1, src_r, src_l);
+ UNPCK_UB_SH(dst1, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src2, src_r, src_l);
+ UNPCK_UB_SH(dst2, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+
+ UNPCK_UB_SH(src3, src_r, src_l);
+ UNPCK_UB_SH(dst3, dst_r, dst_l);
+ res_h_r = (src_r * src_wt);
+ res_h_r += (dst_r * dst_wt);
+ res_h_l = (src_l * src_wt);
+ res_h_l += (dst_l * dst_wt);
+ SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
+ PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
+ dst_ptr += dst_stride;
+ }
+}
+
+void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int src_weight) {
+ filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight);
+}
+
+void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int src_weight) {
+ filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight);
+}
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 53ae921..64d379c 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -161,6 +161,8 @@
int up_available;
int left_available;
+ const vp9_prob (*partition_probs)[PARTITION_TYPES - 1];
+
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index a2584e8..ad6c04b 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -133,12 +133,6 @@
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = {
- -EOB_MODEL_TOKEN, 2,
- -ZERO_TOKEN, 4,
- -ONE_TOKEN, -TWO_TOKEN,
-};
-
// Model obtained from a 2-sided zero-centerd distribuition derived
// from a Pareto distribution. The cdf of the distribution is:
// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 4e02630..2fc97c3 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -74,7 +74,6 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#define EOB_MODEL_TOKEN 3
-extern const vp9_tree_index vp9_coefmodel_tree[];
typedef struct {
const vp9_tree_index *tree;
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 424451f..22d431b 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -314,7 +314,7 @@
{ 149, 144, },
};
-void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
+static void init_mode_probs(FRAME_CONTEXT *fc) {
vp9_copy(fc->uv_mode_prob, default_if_uv_probs);
vp9_copy(fc->y_mode_prob, default_if_y_probs);
vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
@@ -444,7 +444,7 @@
lf->last_sharpness_level = -1;
vp9_default_coef_probs(cm);
- vp9_init_mode_probs(cm->fc);
+ init_mode_probs(cm->fc);
vp9_init_mv_probs(cm);
cm->fc->initialized = 1;
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index a0619ec..8c9e6a7 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -90,8 +90,6 @@
void vp9_setup_past_independence(struct VP9Common *cm);
-void vp9_init_mode_probs(FRAME_CONTEXT *fc);
-
void vp9_adapt_mode_probs(struct VP9Common *cm);
void tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 3af2a41..1811d76 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -162,7 +162,8 @@
int show_existing_frame;
// Flag signaling that the frame is encoded using only INTRA modes.
- int intra_only;
+ uint8_t intra_only;
+ uint8_t last_intra_only;
int allow_high_precision_mv;
@@ -335,6 +336,18 @@
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE void set_partition_probs(const VP9_COMMON *const cm,
+ MACROBLOCKD *const xd) {
+ xd->partition_probs =
+ frame_is_intra_only(cm) ?
+ &vp9_kf_partition_probs[0] :
+ (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
+}
+
static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
int i;
@@ -355,16 +368,13 @@
xd->above_seg_context = cm->above_seg_context;
xd->mi_stride = cm->mi_stride;
xd->error_info = &cm->error;
+
+ set_partition_probs(cm, xd);
}
-static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
- return cm->frame_type == KEY_FRAME || cm->intra_only;
-}
-
-static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
+static INLINE const vp9_prob* get_partition_probs(const MACROBLOCKD *xd,
int ctx) {
- return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
- : cm->fc->partition_prob[ctx];
+ return xd->partition_probs[ctx];
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
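The refactor above caches the per-frame partition probability table on MACROBLOCKD, so get_partition_probs() becomes a plain array lookup instead of re-testing frame_is_intra_only() for every partition symbol. A usage sketch of the intended call pattern (the wrapper function is hypothetical, shown only for illustration):

/* Hypothetical illustration: select the table once per frame,
 * then index it per partition context during block decode. */
static void example_partition_prob_use(VP9_COMMON *cm, MACROBLOCKD *xd,
                                       int ctx) {
  set_partition_probs(cm, xd);  /* frame setup */
  {
    const vp9_prob *probs = get_partition_probs(xd, ctx);  /* per block */
    (void)probs;
  }
}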
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 650f4ad..1e9acb8 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -533,8 +533,8 @@
}
intra_pred_no_4x4(d117)
-void vp9_d135_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
const int I = left[0];
const int J = left[1];
const int K = left[2];
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 5035126..27cd3d0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -60,7 +60,7 @@
specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
@@ -72,7 +72,7 @@
specialize qw/vp9_d117_predictor_4x4/;
add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d135_predictor_4x4/;
+specialize qw/vp9_d135_predictor_4x4 neon/;
add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
@@ -84,16 +84,16 @@
specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc";
add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc";
+specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -192,7 +192,7 @@
specialize qw/vp9_d135_predictor_32x32/;
add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_d153_predictor_32x32/;
+specialize qw/vp9_d153_predictor_32x32/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc";
@@ -201,16 +201,16 @@
specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64";
add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc";
+specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc";
#
# Loopfilter
@@ -276,10 +276,10 @@
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight16x16 sse2/;
+specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
-specialize qw/vp9_filter_by_weight8x8 sse2/;
+specialize qw/vp9_filter_by_weight8x8 sse2 msa/;
}
#
@@ -1029,7 +1029,7 @@
specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16 sse2/;
+ specialize qw/vp9_fht16x16 sse2 msa/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
@@ -1047,19 +1047,19 @@
specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16_1 sse2/;
+ specialize qw/vp9_fdct16x16_1 sse2 msa/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16 sse2/;
+ specialize qw/vp9_fdct16x16 sse2 msa/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_1 sse2/;
+ specialize qw/vp9_fdct32x32_1 sse2 msa/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32 sse2 avx2/;
+ specialize qw/vp9_fdct32x32 sse2 avx2 msa/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+ specialize qw/vp9_fdct32x32_rd sse2 avx2 msa/;
}
#
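For context on what the specialize lines above produce: the rtcd generator emits prototypes for every listed variant plus a dispatch pointer that the runtime setup points at the best available implementation. Roughly, as a hedged sketch of the generated vp9_rtcd.h rather than a verbatim quote:

/* Approximate shape of what "specialize qw/vp9_fdct16x16 sse2 msa/"
 * generates (an assumption about the generated header; tran_low_t and
 * RTCD_EXTERN come from that header's own context). */
void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
void vp9_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, tran_low_t *output,
                                  int stride);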
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 30ca2d0..9311d8d 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -346,6 +346,357 @@
*args->eobtotal += eob;
}
+static void build_mc_border(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint8_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ memset(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy);
+
+ if (right)
+ memset(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+ uint16_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h,
+ int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ vpx_memset16(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right)
+ vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+ int x0, int y0, int b_w, int b_h,
+ int frame_width, int frame_height,
+ int border_offset,
+ uint8_t *const dst, int dst_buf_stride,
+ int subpel_x, int subpel_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ MACROBLOCKD *xd,
+ int w, int h, int ref, int xs, int ys) {
+ DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+ const uint8_t *buf_ptr;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
+ x0, y0, b_w, b_h, frame_width, frame_height);
+ buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
+ } else {
+ build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
+ x0, y0, b_w, b_h, frame_width, frame_height);
+ buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
+ inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+ }
+}
+#else
+static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
+ int x0, int y0, int b_w, int b_h,
+ int frame_width, int frame_height,
+ int border_offset,
+ uint8_t *const dst, int dst_buf_stride,
+ int subpel_x, int subpel_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ int w, int h, int ref, int xs, int ys) {
+ DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+ const uint8_t *buf_ptr;
+
+ build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
+ x0, y0, b_w, b_h, frame_width, frame_height);
+ buf_ptr = mc_buf + border_offset;
+
+ inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ struct buf_2d *pre_buf,
+ struct buf_2d *dst_buf, const MV* mv,
+ RefCntBuffer *ref_frame_buf,
+ int is_scaled, int ref) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+ buf_stride, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
+
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_frame_buf->buf.y_crop_width;
+ frame_height = ref_frame_buf->buf.y_crop_height;
+ ref_frame = ref_frame_buf->buf.y_buffer;
+ } else {
+ frame_width = ref_frame_buf->buf.uv_crop_width;
+ frame_height = ref_frame_buf->buf.uv_crop_height;
+ ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+ : ref_frame_buf->buf.v_buffer;
+ }
+
+ if (is_scaled) {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+ // Co-ordinate of containing block to pixel precision.
+ int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = (x_start + x) << SUBPEL_BITS;
+ y0_16 = (y_start + y) << SUBPEL_BITS;
+
+ // Co-ordinate of current block in reference frame
+ // to 1/16th pixel precision.
+ x0_16 = sf->scale_value_x(x0_16, sf);
+ y0_16 = sf->scale_value_y(y0_16, sf);
+
+ // Map the top left corner of the block into the reference frame.
+ x0 = sf->scale_value_x(x_start + x, sf);
+ y0 = sf->scale_value_y(y_start + y, sf);
+
+ // Scale the MV and incorporate the sub-pixel offset of the block
+ // in the reference frame.
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ // Co-ordinate of containing block to pixel precision.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+ scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+ buf_stride = pre_buf->stride;
+
+ // Do border extension if there is motion or the
+ // width/height is not a multiple of 8 pixels.
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+ int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+
+ // Get reference block bottom right horizontal coordinate.
+ int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
+ int x_pad = 0, y_pad = 0;
+
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ x_pad = 1;
+ }
+
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ y_pad = 1;
+ }
+
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+ // Extend the border only if part of the block lies outside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ // Extend the border.
+ const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
+ const int b_w = x1 - x0 + 1;
+ const int b_h = y1 - y0 + 1;
+ const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+
+ extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
+ frame_width, frame_height, border_offset,
+ dst, dst_buf->stride,
+ subpel_x, subpel_y,
+ kernel, sf,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd,
+#endif
+ w, h, ref, xs, ys);
+ return;
+ }
+ } else {
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode) {
+ const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+ }
+#else
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static void dec_build_inter_predictors_sb(VP9Decoder *const pbi,
+ MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const MODE_INFO *mi = xd->mi[0];
+ const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+ const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+ const int is_compound = has_second_ref(&mi->mbmi);
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+ &xd->plane[plane]);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ int ref;
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ const int idx = xd->block_refs[ref]->idx;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+ const int is_scaled = vp9_is_scaled(sf);
+
+ if (sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, i++);
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv,
+ ref_frame_buf, is_scaled, ref);
+ }
+ }
+ } else {
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
+ }
+ }
+ }
+}
+
static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
@@ -405,7 +756,7 @@
predict_and_reconstruct_intra_block, &arg);
} else {
// Prediction
- vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
+ dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
@@ -420,14 +771,11 @@
xd->corrupted |= vp9_reader_has_error(r);
}
-static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
- int hbs,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- vp9_reader *r) {
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, vp9_reader *r,
+ int has_rows, int has_cols) {
const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- const vp9_prob *const probs = get_partition_probs(cm, ctx);
- const int has_rows = (mi_row + hbs) < cm->mi_rows;
- const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ const vp9_prob *const probs = get_partition_probs(xd, ctx);
FRAME_COUNTS *counts = xd->counts;
PARTITION_TYPE p;
@@ -454,11 +802,13 @@
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
+ partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols);
subsize = get_subsize(bsize, partition);
if (bsize == BLOCK_8X8) {
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
@@ -469,12 +819,12 @@
break;
case PARTITION_HORZ:
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
- if (mi_row + hbs < cm->mi_rows)
+ if (has_rows)
decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize);
- if (mi_col + hbs < cm->mi_cols)
+ if (has_cols)
decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
@@ -669,12 +1019,6 @@
: literal_to_filter[vp9_rb_read_literal(rb, 2)];
}
-void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
- int *width, int *height) {
- *width = vp9_rb_read_literal(rb, 16) + 1;
- *height = vp9_rb_read_literal(rb, 16) + 1;
-}
-
static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->display_width = cm->width;
cm->display_height = cm->height;
@@ -1112,8 +1456,6 @@
if (pbi->num_tile_workers == 0) {
const int num_threads = pbi->max_threads & ~1;
int i;
- // TODO(jzern): Allocate one less worker, as in the current code we only
- // use num_threads - 1 workers.
CHECK_MEM_ERROR(cm, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
// Ensure tile data offsets will be properly aligned. This may fail on
@@ -1253,20 +1595,6 @@
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
}
-int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
- return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
- vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
- vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
-}
-
-BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
- int profile = vp9_rb_read_bit(rb);
- profile |= vp9_rb_read_bit(rb) << 1;
- if (profile > 2)
- profile += vp9_rb_read_bit(rb);
- return (BITSTREAM_PROFILE) profile;
-}
-
static void read_bitdepth_colorspace_sampling(
VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
if (cm->profile >= PROFILE_2) {
@@ -1319,6 +1647,7 @@
size_t sz;
cm->last_frame_type = cm->frame_type;
+ cm->last_intra_only = cm->intra_only;
if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
@@ -1595,12 +1924,12 @@
}
#endif // NDEBUG
-static struct vp9_read_bit_buffer* init_read_bit_buffer(
+static struct vp9_read_bit_buffer *init_read_bit_buffer(
VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb,
const uint8_t *data,
const uint8_t *data_end,
- uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) {
+ uint8_t clear_data[MAX_VP9_HEADER_SIZE]) {
rb->bit_offset = 0;
rb->error_handler = error_handler;
rb->error_handler_data = &pbi->common;
@@ -1616,12 +1945,34 @@
return rb;
}
+//------------------------------------------------------------------------------
+
+int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) {
+ return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 &&
+ vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 &&
+ vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2;
+}
+
+void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
+ int *width, int *height) {
+ *width = vp9_rb_read_literal(rb, 16) + 1;
+ *height = vp9_rb_read_literal(rb, 16) + 1;
+}
+
+BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) {
+ int profile = vp9_rb_read_bit(rb);
+ profile |= vp9_rb_read_bit(rb) << 1;
+ if (profile > 2)
+ profile += vp9_rb_read_bit(rb);
+ return (BITSTREAM_PROFILE) profile;
+}
+
void vp9_decode_frame(VP9Decoder *pbi,
const uint8_t *data, const uint8_t *data_end,
const uint8_t **p_data_end) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
+ struct vp9_read_bit_buffer rb;
int context_updated = 0;
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
@@ -1645,8 +1996,9 @@
cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
cm->width == cm->last_width &&
cm->height == cm->last_height &&
- !cm->intra_only &&
- cm->last_show_frame;
+ !cm->last_intra_only &&
+ cm->last_show_frame &&
+ (cm->last_frame_type != KEY_FRAME);
vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
@@ -1725,353 +2077,3 @@
if (cm->refresh_frame_context && !context_updated)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
-
-static void build_mc_border(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h, int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint8_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- memset(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy);
-
- if (right)
- memset(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void high_build_mc_border(const uint8_t *src8, int src_stride,
- uint16_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h,
- int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- vpx_memset16(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
- if (right)
- vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
- int x0, int y0, int b_w, int b_h,
- int frame_width, int frame_height,
- int border_offset,
- uint8_t *const dst, int dst_buf_stride,
- int subpel_x, int subpel_y,
- const InterpKernel *kernel,
- const struct scale_factors *sf,
- MACROBLOCKD *xd,
- int w, int h, int ref, int xs, int ys) {
- DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
- const uint8_t *buf_ptr;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
- x0, y0, b_w, b_h, frame_width, frame_height);
- buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
- } else {
- build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
- x0, y0, b_w, b_h, frame_width, frame_height);
- buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
- }
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
-}
-#else
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
- int x0, int y0, int b_w, int b_h,
- int frame_width, int frame_height,
- int border_offset,
- uint8_t *const dst, int dst_buf_stride,
- int subpel_x, int subpel_y,
- const InterpKernel *kernel,
- const struct scale_factors *sf,
- int w, int h, int ref, int xs, int ys) {
- DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
- const uint8_t *buf_ptr;
-
- build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
- x0, y0, b_w, b_h, frame_width, frame_height);
- buf_ptr = mc_buf + border_offset;
-
- inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int plane, int bw, int bh, int x,
- int y, int w, int h, int mi_x, int mi_y,
- const InterpKernel *kernel,
- const struct scale_factors *sf,
- struct buf_2d *pre_buf,
- struct buf_2d *dst_buf, const MV* mv,
- RefCntBuffer *ref_frame_buf,
- int is_scaled, int ref) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- MV32 scaled_mv;
- int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
- buf_stride, subpel_x, subpel_y;
- uint8_t *ref_frame, *buf_ptr;
-
- // Get reference frame pointer, width and height.
- if (plane == 0) {
- frame_width = ref_frame_buf->buf.y_crop_width;
- frame_height = ref_frame_buf->buf.y_crop_height;
- ref_frame = ref_frame_buf->buf.y_buffer;
- } else {
- frame_width = ref_frame_buf->buf.uv_crop_width;
- frame_height = ref_frame_buf->buf.uv_crop_height;
- ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
- : ref_frame_buf->buf.v_buffer;
- }
-
- if (is_scaled) {
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
- // Co-ordinate of containing block to pixel precision.
- int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
- int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = (x_start + x) << SUBPEL_BITS;
- y0_16 = (y_start + y) << SUBPEL_BITS;
-
- // Co-ordinate of current block in reference frame
- // to 1/16th pixel precision.
- x0_16 = sf->scale_value_x(x0_16, sf);
- y0_16 = sf->scale_value_y(y0_16, sf);
-
- // Map the top left corner of the block into the reference frame.
- x0 = sf->scale_value_x(x_start + x, sf);
- y0 = sf->scale_value_y(y_start + y, sf);
-
- // Scale the MV and incorporate the sub-pixel offset of the block
- // in the reference frame.
- scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
- xs = sf->x_step_q4;
- ys = sf->y_step_q4;
- } else {
- // Co-ordinate of containing block to pixel precision.
- x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
- y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = x0 << SUBPEL_BITS;
- y0_16 = y0 << SUBPEL_BITS;
-
- scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
- scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
- xs = ys = 16;
- }
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
-
- // Calculate the top left corner of the best matching block in the
- // reference frame.
- x0 += scaled_mv.col >> SUBPEL_BITS;
- y0 += scaled_mv.row >> SUBPEL_BITS;
- x0_16 += scaled_mv.col;
- y0_16 += scaled_mv.row;
-
- // Get reference block pointer.
- buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
- buf_stride = pre_buf->stride;
-
- // Do border extension if there is motion or the
- // width/height is not a multiple of 8 pixels.
- if (is_scaled || scaled_mv.col || scaled_mv.row ||
- (frame_width & 0x7) || (frame_height & 0x7)) {
- int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-
- // Get reference block bottom right horizontal coordinate.
- int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
- int x_pad = 0, y_pad = 0;
-
- if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
- x0 -= VP9_INTERP_EXTEND - 1;
- x1 += VP9_INTERP_EXTEND;
- x_pad = 1;
- }
-
- if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
- y0 -= VP9_INTERP_EXTEND - 1;
- y1 += VP9_INTERP_EXTEND;
- y_pad = 1;
- }
-
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-
- // Skip border extension if block is inside the frame.
- if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
- y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
- // Extend the border.
- const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
- const int b_w = x1 - x0 + 1;
- const int b_h = y1 - y0 + 1;
- const int border_offset = y_pad * 3 * b_w + x_pad * 3;
-
- extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
- frame_width, frame_height, border_offset,
- dst, dst_buf->stride,
- subpel_x, subpel_y,
- kernel, sf,
-#if CONFIG_VP9_HIGHBITDEPTH
- xd,
-#endif
- w, h, ref, xs, ys);
- return;
- }
- } else {
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode) {
- const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
- }
- }
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
-#else
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-}
-
-void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- const MODE_INFO *mi = xd->mi[0];
- const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
- const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
- const int is_compound = has_second_ref(&mi->mbmi);
-
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
- &xd->plane[plane]);
- struct macroblockd_plane *const pd = &xd->plane[plane];
- struct buf_2d *const dst_buf = &pd->dst;
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
-
- const int bw = 4 * num_4x4_w;
- const int bh = 4 * num_4x4_h;
- int ref;
-
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- const int idx = xd->block_refs[ref]->idx;
- BufferPool *const pool = pbi->common.buffer_pool;
- RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
- const int is_scaled = vp9_is_scaled(sf);
-
- if (sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < num_4x4_h; ++y) {
- for (x = 0; x < num_4x4_w; ++x) {
- const MV mv = average_split_mvs(pd, mi, ref, i++);
- dec_build_inter_predictors(pbi, xd, plane, bw, bh,
- 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
- sf, pre_buf, dst_buf, &mv,
- ref_frame_buf, is_scaled, ref);
- }
- }
- } else {
- const MV mv = mi->mbmi.mv[ref].as_mv;
- dec_build_inter_predictors(pbi, xd, plane, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y, kernel,
- sf, pre_buf, dst_buf, &mv, ref_frame_buf,
- is_scaled, ref);
- }
- }
- }
-}
diff --git a/vp9/decoder/vp9_decodeframe.h b/vp9/decoder/vp9_decodeframe.h
index 8410c54..a876e7c 100644
--- a/vp9/decoder/vp9_decodeframe.h
+++ b/vp9/decoder/vp9_decodeframe.h
@@ -16,24 +16,18 @@
extern "C" {
#endif
-struct VP9Common;
struct VP9Decoder;
struct vp9_read_bit_buffer;
-void vp9_init_dequantizer(struct VP9Common *cm);
-
-void vp9_decode_frame(struct VP9Decoder *pbi,
- const uint8_t *data, const uint8_t *data_end,
- const uint8_t **p_data_end);
-
int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb);
void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
int *width, int *height);
BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
-void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
- MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize);
+void vp9_decode_frame(struct VP9Decoder *pbi,
+ const uint8_t *data, const uint8_t *data_end,
+ const uint8_t **p_data_end);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d34926d..8a8d8dd 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -599,19 +599,20 @@
MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
int w, h;
- if (frame_is_intra_only(cm))
+ if (frame_is_intra_only(cm)) {
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
- else
+ } else {
read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r);
- for (h = 0; h < y_mis; ++h) {
- MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
- for (w = 0; w < x_mis; ++w) {
- MV_REF *const mv = frame_mv + w;
- mv->ref_frame[0] = mi->mbmi.ref_frame[0];
- mv->ref_frame[1] = mi->mbmi.ref_frame[1];
- mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
- mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
}
}
}
diff --git a/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
new file mode 100644
index 0000000..a3ebfab
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+static void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+ int32_t src_stride) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+ v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+ v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 };
+ v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64,
+ cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 };
+ v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64,
+ 0, 0, 0, 0 };
+
+ LD_SH16(input, src_stride,
+ in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in8, in9, in10, in11, 2);
+ SLLI_4V(in12, in13, in14, in15, 2);
+ ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3);
+ ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7);
+ VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32);
+ SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12);
+ SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8);
+
+ tmp_ptr += 16;
+
+ /* stp 1 */
+ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4);
+ ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5);
+
+ cnst4 = __msa_splati_h(coeff, 0);
+ stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4);
+
+ cnst5 = __msa_splati_h(coeff, 1);
+ cnst5 = __msa_ilvev_h(cnst5, cnst4);
+ stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5);
+ stp24 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4);
+ stp23 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5);
+
+ /* stp2 */
+ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+ BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+ ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4);
+ ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5);
+ SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ stp26 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0);
+
+ cnst0 = __msa_splati_h(coeff, 4);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+ stp21 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1);
+
+ BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+ ILVRL_H2_SH(in15, in8, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr);
+
+ cnst0 = __msa_splati_h(coeff2, 0);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 224);
+
+ ILVRL_H2_SH(in14, in9, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 128);
+
+ cnst1 = __msa_splati_h(coeff2, 2);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 96);
+
+ SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ stp25 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ cnst1 = __msa_splati_h(coeff, 3);
+ cnst1 = __msa_ilvev_h(cnst0, cnst1);
+ stp22 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1);
+
+ /* stp4 */
+ ADD2(stp34, stp25, stp33, stp22, in13, in10);
+
+ ILVRL_H2_SH(in13, in10, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 64);
+
+ cnst0 = __msa_splati_h(coeff2, 1);
+ cnst0 = __msa_ilvev_h(cnst1, cnst0);
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 160);
+
+ SUB2(stp34, stp25, stp33, stp22, in12, in11);
+ ILVRL_H2_SH(in12, in11, vec1, vec0);
+ SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1);
+ cnst1 = __msa_ilvev_h(cnst1, cnst0);
+
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1);
+ ST_SH(in8, tmp_ptr + 192);
+
+ cnst1 = __msa_splati_h(coeff2, 3);
+ cnst0 = __msa_ilvev_h(cnst0, cnst1);
+ in8 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0);
+ ST_SH(in8, tmp_ptr + 32);
+}
+
+static void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+
+ LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+ ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11);
+ ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15);
+ SRA_4V(in0, in1, in2, in3, 2);
+ SRA_4V(in4, in5, in6, in7, 2);
+ SRA_4V(in8, in9, in10, in11, 2);
+ SRA_4V(in12, in13, in14, in15, 2);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
+ tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16);
+ VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+ tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+ tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16);
+}
+
+void vp9_fdct16x16_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+ }
+}
+
+void vp9_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[1] = 0;
+
+ out[0] = VP9_LD_HADD(input, stride);
+ out[0] += VP9_LD_HADD(input + 8, stride);
+ out[0] += VP9_LD_HADD(input + 16 * 8, stride);
+ out[0] += VP9_LD_HADD(input + 16 * 8 + 8, stride);
+ out[0] >>= 1;
+}
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+ const int32_t *const0, int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r15 = LD_SH(input + 15 * stride);
+ r7 = LD_SH(input + 7 * stride);
+ r8 = LD_SH(input + 8 * stride);
+ SLLI_4V(r0, r15, r7, r8, 2);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 8, 4, k2, k3);
+ VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * stride);
+ r4 = LD_SH(input + 4 * stride);
+ r11 = LD_SH(input + 11 * stride);
+ r12 = LD_SH(input + 12 * stride);
+ SLLI_4V(r3, r4, r11, r12, 2);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp2, int_buf, 8);
+ ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+ ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+ r9 = LD_SH(input + 9 * stride);
+ r6 = LD_SH(input + 6 * stride);
+ r1 = LD_SH(input + stride);
+ r14 = LD_SH(input + 14 * stride);
+ SLLI_4V(r9, r6, r1, r14, 2);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r13 = LD_SH(input + 13 * stride);
+ r2 = LD_SH(input + 2 * stride);
+ r5 = LD_SH(input + 5 * stride);
+ r10 = LD_SH(input + 10 * stride);
+ SLLI_4V(r13, r2, r5, r10, 2);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ int16_t *out_ptr = out + 128;
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+ LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 21);
+ VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4 */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ r0, r1, r2, r3, r4, r5, r6, r7);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+ out += 64;
+
+ /* load input data */
+ input += 128;
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ r0, r1, r2, r3, r4, r5, r6, r7);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ VP9_FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+ int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r7 = LD_SH(input + 7 * 8);
+ r8 = LD_SH(input + 8 * 8);
+ r15 = LD_SH(input + 15 * 8);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 4 * 2, 4, k2, k3);
+ VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * 8);
+ r4 = LD_SH(input + 4 * 8);
+ r11 = LD_SH(input + 11 * 8);
+ r12 = LD_SH(input + 12 * 8);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp1, int_buf, 4 * 8);
+ ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+ ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+ ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+ r1 = LD_SH(input + 8);
+ r6 = LD_SH(input + 6 * 8);
+ r9 = LD_SH(input + 9 * 8);
+ r14 = LD_SH(input + 14 * 8);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r2 = LD_SH(input + 2 * 8);
+ r5 = LD_SH(input + 5 * 8);
+ r10 = LD_SH(input + 10 * 8);
+ r13 = LD_SH(input + 13 * 8);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ int16_t *out_ptr = out + 8;
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ g13 = LD_SH(int_buf + 3 * 8);
+ g15 = LD_SH(int_buf + 7 * 8);
+ g5 = LD_SH(int_buf + 11 * 8);
+ g7 = LD_SH(int_buf + 15 * 8);
+
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 21);
+ VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ VP9_MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4 */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+ l4, l12, l5, l13, l6, l14, l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ r0, r1, r2, r3, r4, r5, r6, r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+ out += 16 * 8;
+
+ /* load input data */
+ input += 128;
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11,
+ l4, l12, l5, l13, l6, l14, l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ r0, r1, r2, r3, r4, r5, r6, r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+ int16_t *temp = intermediate;
+ int16_t *out = output;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+ v8i16 in12, in13, in14, in15;
+
+ LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ temp = intermediate + 8;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in0, in1);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in2, in3);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in6, in7);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ temp = intermediate;
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+ VP9_FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ temp = intermediate;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ VP9_FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3,
+ tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7,
+ tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7);
+ out = output + 8;
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void vp9_fht16x16_msa(const int16_t *input, int16_t *output,
+ int32_t stride, int32_t tx_type) {
+ DECLARE_ALIGNED(32, int16_t, tmp[256]);
+ DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+ int32_t i;
+ int16_t *ptmpbuf = &tmp_buf[0];
+ int16_t *trans = &trans_buf[0];
+ const int32_t const_arr[29 * 4] = {
+ 52707308, 52707308, 52707308, 52707308,
+ -1072430300, -1072430300, -1072430300, -1072430300,
+ 795618043, 795618043, 795618043, 795618043,
+ -721080468, -721080468, -721080468, -721080468,
+ 459094491, 459094491, 459094491, 459094491,
+ -970646691, -970646691, -970646691, -970646691,
+ 1010963856, 1010963856, 1010963856, 1010963856,
+ -361743294, -361743294, -361743294, -361743294,
+ 209469125, 209469125, 209469125, 209469125,
+ -1053094788, -1053094788, -1053094788, -1053094788,
+ 1053160324, 1053160324, 1053160324, 1053160324,
+ 639644520, 639644520, 639644520, 639644520,
+ -862444000, -862444000, -862444000, -862444000,
+ 1062144356, 1062144356, 1062144356, 1062144356,
+ -157532337, -157532337, -157532337, -157532337,
+ 260914709, 260914709, 260914709, 260914709,
+ -1041559667, -1041559667, -1041559667, -1041559667,
+ 920985831, 920985831, 920985831, 920985831,
+ -551995675, -551995675, -551995675, -551995675,
+ 596522295, 596522295, 596522295, 596522295,
+ 892853362, 892853362, 892853362, 892853362,
+ -892787826, -892787826, -892787826, -892787826,
+ 410925857, 410925857, 410925857, 410925857,
+ -992012162, -992012162, -992012162, -992012162,
+ 992077698, 992077698, 992077698, 992077698,
+ 759246145, 759246145, 759246145, 759246145,
+ -759180609, -759180609, -759180609, -759180609,
+ -759222975, -759222975, -759222975, -759222975,
+ 759288511, 759288511, 759288511, 759288511 };
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case ADST_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case DCT_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ case ADST_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
diff --git a/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c
new file mode 100644
index 0000000..3a74023
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct32x32_msa.c
@@ -0,0 +1,956 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+ int32_t src_stride,
+ int16_t *temp_buff) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 step0, step1, step2, step3;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+ v8i16 step0_1, step1_1, step2_1, step3_1;
+
+ /* 1st and 2nd set */
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+ step0, step1, step2, step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff, 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8);
+ ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8);
+
+ /* 3rd and 4th set */
+ LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3);
+ LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7);
+ LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1);
+ LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+ SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2);
+ SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2);
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,
+ step0, step1, step2, step3, in4, in5, in6, in7);
+ BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8);
+ ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8);
+ ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8);
+ ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8i16 temp0, temp1;
+
+ /* fdct even */
+ LD_SH4(input, 8, in0, in1, in2, in3);
+ LD_SH4(input + 96, 8, in12, in13, in14, in15);
+ BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, in12, in13, in14, in15);
+ LD_SH4(input + 32, 8, in4, in5, in6, in7);
+ LD_SH4(input + 64, 8, in8, in9, in10, in11);
+ BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11,
+ vec4, vec5, vec6, vec7, in8, in9, in10, in11);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp);
+ ST_SH(temp1, temp + 512);
+
+ VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 256);
+ ST_SH(temp1, temp + 768);
+
+ SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4);
+ VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 128);
+ ST_SH(temp1, temp + 896);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 640);
+ ST_SH(temp1, temp + 384);
+
+ VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 64);
+ ST_SH(temp1, temp + 960);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 576);
+ ST_SH(temp1, temp + 448);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 320);
+ ST_SH(temp1, temp + 704);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ VP9_FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+ ST_SH(temp0, temp + 192);
+ ST_SH(temp1, temp + 832);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(input + 32);
+ in21 = LD_SH(input + 40);
+ in26 = LD_SH(input + 80);
+ in27 = LD_SH(input + 88);
+
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(input + 16);
+ in19 = LD_SH(input + 24);
+ in28 = LD_SH(input + 96);
+ in29 = LD_SH(input + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, input + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, input + 40);
+ vec4 = in29 - in26;
+ ST_SH(vec4, input + 80);
+ vec4 = in28 - in27;
+ ST_SH(vec4, input + 88);
+
+ in21 = in18 + in21;
+ in20 = in19 + in20;
+ in27 = in28 + in27;
+ in26 = in29 + in26;
+
+ LD_SH4(input + 48, 8, in22, in23, in24, in25);
+ VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(input);
+ in17 = LD_SH(input + 8);
+ in30 = LD_SH(input + 112);
+ in31 = LD_SH(input + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, input + 16);
+ vec4 = in16 - in23;
+ ST_SH(vec4, input + 24);
+ vec4 = in31 - in24;
+ ST_SH(vec4, input + 96);
+ vec4 = in30 - in25;
+ ST_SH(vec4, input + 104);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr);
+ ST_SH(vec4, temp_ptr + 960);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 448);
+ ST_SH(vec4, temp_ptr + 512);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 704);
+ ST_SH(vec5, temp_ptr + 256);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec4, temp_ptr + 192);
+ ST_SH(vec5, temp_ptr + 768);
+
+ LD_SH4(input + 16, 8, in22, in23, in20, in21);
+ LD_SH4(input + 80, 8, in26, in27, in24, in25);
+ in16 = in20;
+ in17 = in21;
+ VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 832);
+ ST_SH(vec4, temp_ptr + 128);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 320);
+ ST_SH(vec4, temp_ptr + 640);
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 576);
+ ST_SH(vec4, temp_ptr + 384);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ VP9_FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+ ST_SH(vec5, temp_ptr + 64);
+ ST_SH(vec4, temp_ptr + 896);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+ int16_t *tmp_buf, int16_t *tmp_buf_big) {
+ fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+ fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+ fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+ int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 step0, step1, step2, step3, step4, step5, step6, step7;
+
+ LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ step0, step1, step2, step3, step4, step5, step6, step7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8);
+
+ /* 2nd set */
+ LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ step0, step1, step2, step3, step4, step5, step6, step7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7,
+ (output + 8 * 8), 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8);
+}
+
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+ v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+ v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8);
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8);
+
+ /* Stage 3 */
+ UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+ UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+ UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+ UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+ UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+ UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+ UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+ UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+ ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r,
+ tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+ BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r);
+ ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l,
+ vec0_r, vec1_r, vec2_r, vec3_r);
+
+ tmp3_w = vec0_r + vec3_r;
+ vec0_r = vec0_r - vec3_r;
+ vec3_r = vec1_r + vec2_r;
+ vec1_r = vec1_r - vec2_r;
+
+ VP9_DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64,
+ cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+ VP9_FDCT32_POSTPROC_NEG_W(vec4_r);
+ VP9_FDCT32_POSTPROC_NEG_W(tmp3_w);
+ VP9_FDCT32_POSTPROC_NEG_W(vec6_r);
+ VP9_FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out, 8);
+
+ VP9_DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64,
+ cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r);
+ VP9_FDCT32_POSTPROC_NEG_W(vec4_r);
+ VP9_FDCT32_POSTPROC_NEG_W(tmp3_w);
+ VP9_FDCT32_POSTPROC_NEG_W(vec6_r);
+ VP9_FDCT32_POSTPROC_NEG_W(vec3_r);
+ PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+ ST_SH2(vec5, vec4, out + 16, 8);
+
+ LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 32);
+ ST_SH(in5, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 40);
+ ST_SH(in5, out + 48);
+
+ LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+ VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 64);
+ ST_SH(in5, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 72);
+ ST_SH(in5, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 80);
+ ST_SH(in5, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ ST_SH(in4, out + 96);
+ ST_SH(in5, out + 88);
+}
+
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+ BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0);
+ VP9_DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ VP9_FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ ADD2(in28, in29, in31, in30, in16, in19);
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+ /* 1st set */
+ in0 = LD_SH(temp);
+ in4 = LD_SH(temp + 32);
+ in2 = LD_SH(temp + 64);
+ in6 = LD_SH(temp + 96);
+ in1 = LD_SH(temp + 128);
+ in7 = LD_SH(temp + 152);
+ in3 = LD_SH(temp + 192);
+ in5 = LD_SH(temp + 216);
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+
+ /* 2nd set */
+ in0_1 = LD_SH(temp + 16);
+ in1_1 = LD_SH(temp + 232);
+ in2_1 = LD_SH(temp + 80);
+ in3_1 = LD_SH(temp + 168);
+ in4_1 = LD_SH(temp + 48);
+ in5_1 = LD_SH(temp + 176);
+ in6_1 = LD_SH(temp + 112);
+ in7_1 = LD_SH(temp + 240);
+
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32);
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+ /* 3rd set */
+ in0 = LD_SH(temp + 8);
+ in1 = LD_SH(temp + 136);
+ in2 = LD_SH(temp + 72);
+ in3 = LD_SH(temp + 200);
+ in4 = LD_SH(temp + 40);
+ in5 = LD_SH(temp + 208);
+ in6 = LD_SH(temp + 104);
+ in7 = LD_SH(temp + 144);
+
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ output + 8, 32);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
+ in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32);
+
+ /* 4th set */
+ in0_1 = LD_SH(temp + 24);
+ in1_1 = LD_SH(temp + 224);
+ in2_1 = LD_SH(temp + 88);
+ in3_1 = LD_SH(temp + 160);
+ in4_1 = LD_SH(temp + 56);
+ in5_1 = LD_SH(temp + 184);
+ in6_1 = LD_SH(temp + 120);
+ in7_1 = LD_SH(temp + 248);
+
+ TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+ ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+ output + 24, 32);
+}
+
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+ fdct8x32_1d_row_even(temp_buf, temp_buf);
+ fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+ fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vp9_fdct32x32_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+ tmp_buf_big + (8 * i));
+ }
+
+ /* row transform */
+ fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+ /* row transform for the remaining three 8-row blocks */
+ for (i = 1; i < 4; ++i) {
+ fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+ }
+}
+
+void vp9_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
+ out[1] = 0;
+
+ out[0] = VP9_LD_HADD(input, stride);
+ out[0] += VP9_LD_HADD(input + 8, stride);
+ out[0] += VP9_LD_HADD(input + 16, stride);
+ out[0] += VP9_LD_HADD(input + 24, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 8, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 8 + 8, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 8 + 16, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 8 + 24, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 16, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 16 + 8, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 16 + 16, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 16 + 24, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 24, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 24 + 8, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 24 + 16, stride);
+ out[0] += VP9_LD_HADD(input + 32 * 24 + 24, stride);
+ out[0] >>= 3;
+}
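
The DC-only path above reduces to summing the whole 32x32 block of residuals and scaling the total; each VP9_LD_HADD contributes one 8x8 partial sum. A plain scalar sketch of the same computation (illustrative only, not part of the patch):

#include <stdint.h>

/* Scalar sketch of the 32x32 DC-only forward transform: accumulate every
 * residual in the 32x32 block, then scale the sum by 1/8. The MSA code
 * above reaches the same total via sixteen 8x8 VP9_LD_HADD partial sums. */
static void fdct32x32_1_sketch(const int16_t *input, int16_t *out,
                               int32_t stride) {
  int32_t r, c, sum = 0;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      sum += input[r * stride + c];
  out[0] = (int16_t)(sum >> 3);
  out[1] = 0;
}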
+
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+ v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+ /* fdct32 even */
+ /* stage 2 */
+ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7);
+ LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15);
+
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15,
+ vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7,
+ in8, in9, in10, in11, in12, in13, in14, in15);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+ VP9_FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+ /* Stage 3 */
+ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3);
+
+ temp0 = in0 + in3;
+ in0 = in0 - in3;
+ in3 = in1 + in2;
+ in1 = in1 - in2;
+
+ VP9_DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+ ST_SH(temp0, out);
+ ST_SH(temp1, out + 8);
+
+ VP9_DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+ ST_SH(temp0, out + 16);
+ ST_SH(temp1, out + 24);
+
+ SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7);
+ VP9_DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+ ADD2(vec4, vec5, vec7, vec6, vec0, vec1);
+ VP9_DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+ ST_SH(temp0, out + 32);
+ ST_SH(temp1, out + 56);
+
+ SUB2(vec4, vec5, vec7, vec6, vec4, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+ ST_SH(temp0, out + 40);
+ ST_SH(temp1, out + 48);
+
+ VP9_DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+ ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2);
+ VP9_DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+ ADD2(in0, in1, in2, in3, vec0, vec7);
+ VP9_DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+ ST_SH(temp0, out + 64);
+ ST_SH(temp1, out + 120);
+
+ SUB2(in0, in1, in2, in3, in0, in2);
+ VP9_DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+ ST_SH(temp0, out + 72);
+ ST_SH(temp1, out + 112);
+
+ SUB2(in9, vec2, in14, vec5, vec2, vec5);
+ VP9_DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1);
+ SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5);
+ VP9_DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+ ST_SH(temp0, out + 80);
+ ST_SH(temp1, out + 104);
+
+ ADD2(in3, in2, in0, in1, vec3, vec4);
+ VP9_DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+ ST_SH(temp0, out + 96);
+ ST_SH(temp1, out + 88);
+}
+
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+ int16_t *out) {
+ v8i16 in16, in17, in18, in19, in20, in21, in22, in23;
+ v8i16 in24, in25, in26, in27, in28, in29, in30, in31;
+ v8i16 vec4, vec5;
+
+ in20 = LD_SH(temp + 32);
+ in21 = LD_SH(temp + 40);
+ in26 = LD_SH(temp + 80);
+ in27 = LD_SH(temp + 88);
+
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+ VP9_FDCT_POSTPROC_2V_NEG_H(in20, in21);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+ in18 = LD_SH(temp + 16);
+ in19 = LD_SH(temp + 24);
+ in28 = LD_SH(temp + 96);
+ in29 = LD_SH(temp + 104);
+
+ VP9_FDCT_POSTPROC_2V_NEG_H(in18, in19);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+ vec4 = in19 - in20;
+ ST_SH(vec4, interm_ptr + 32);
+ vec4 = in18 - in21;
+ ST_SH(vec4, interm_ptr + 88);
+ vec4 = in29 - in26;
+ ST_SH(vec4, interm_ptr + 64);
+ vec4 = in28 - in27;
+ ST_SH(vec4, interm_ptr + 56);
+
+ ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26);
+
+ in22 = LD_SH(temp + 48);
+ in23 = LD_SH(temp + 56);
+ in24 = LD_SH(temp + 64);
+ in25 = LD_SH(temp + 72);
+
+ VP9_DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+ VP9_DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in22, in23);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+ in16 = LD_SH(temp);
+ in17 = LD_SH(temp + 8);
+ in30 = LD_SH(temp + 112);
+ in31 = LD_SH(temp + 120);
+
+ VP9_FDCT_POSTPROC_2V_NEG_H(in16, in17);
+ VP9_FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+ vec4 = in17 - in22;
+ ST_SH(vec4, interm_ptr + 40);
+ vec4 = in30 - in25;
+ ST_SH(vec4, interm_ptr + 48);
+ vec4 = in31 - in24;
+ ST_SH(vec4, interm_ptr + 72);
+ vec4 = in16 - in23;
+ ST_SH(vec4, interm_ptr + 80);
+
+ ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31);
+ VP9_DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+ VP9_DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+ ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+ ADD2(in27, in26, in25, in24, in23, in20);
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+ ST_SH(vec5, out);
+ ST_SH(vec4, out + 120);
+
+ SUB2(in27, in26, in25, in24, in22, in21);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+ ST_SH(vec5, out + 112);
+ ST_SH(vec4, out + 8);
+
+ SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20);
+ VP9_DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25);
+ SUB2(in26, in27, in24, in25, in23, in20);
+ VP9_DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+ ST_SH(vec4, out + 16);
+ ST_SH(vec5, out + 104);
+
+ ADD2(in26, in27, in24, in25, in22, in21);
+ VP9_DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+ ST_SH(vec4, out + 24);
+ ST_SH(vec5, out + 96);
+
+ in20 = LD_SH(interm_ptr + 32);
+ in21 = LD_SH(interm_ptr + 88);
+ in27 = LD_SH(interm_ptr + 56);
+ in26 = LD_SH(interm_ptr + 64);
+
+ in16 = in20;
+ in17 = in21;
+ VP9_DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27);
+ VP9_DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+ in22 = LD_SH(interm_ptr + 40);
+ in25 = LD_SH(interm_ptr + 48);
+ in24 = LD_SH(interm_ptr + 72);
+ in23 = LD_SH(interm_ptr + 80);
+
+ SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+ in16 = in28 + in29;
+ in19 = in31 + in30;
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+ ST_SH(vec5, out + 32);
+ ST_SH(vec4, out + 88);
+
+ SUB2(in28, in29, in31, in30, in17, in18);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+ ST_SH(vec5, out + 40);
+ ST_SH(vec4, out + 80);
+
+ ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19);
+ VP9_DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31);
+ SUB2(in29, in28, in30, in31, in16, in19);
+ VP9_DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+ ST_SH(vec5, out + 72);
+ ST_SH(vec4, out + 48);
+
+ ADD2(in29, in28, in30, in31, in17, in18);
+ VP9_DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+ ST_SH(vec4, out + 56);
+ ST_SH(vec5, out + 64);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+ int16_t *output) {
+ fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+ fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+ fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+ fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
+void vp9_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
+ int32_t src_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+ /* column transform */
+ for (i = 0; i < 4; ++i) {
+ fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+ &tmp_buf_big[0] + (8 * i));
+ }
+
+ /* row transform */
+ for (i = 0; i < 4; ++i) {
+ fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+ out + (8 * i * 32));
+ }
+}
diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h
new file mode 100644
index 0000000..99d299a
--- /dev/null
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(cnst0); \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = (v4i32)__msa_fill_h(cnst1); \
+ k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ \
+ ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
+ DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+ \
+ DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
+}
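
In scalar terms, this macro appears to perform the usual fdct butterfly rotation on each of the eight lanes, with rounding by DCT_CONST_BITS (14, from vp9_idct.h). A hedged per-lane sketch with hypothetical helper names:

#include <stdint.h>

/* Per-lane sketch of VP9_DOTP_CONST_PAIR (illustrative, not the MSA code). */
static int16_t fdct_round_shift14(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);  /* round, then >> DCT_CONST_BITS */
}

static void dotp_const_pair_sketch(const int16_t reg0[8],
                                   const int16_t reg1[8],
                                   int16_t cnst0, int16_t cnst1,
                                   int16_t out0[8], int16_t out1[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out0[i] = fdct_round_shift14(reg0[i] * cnst0 - reg1[i] * cnst1);
    out1[i] = fdct_round_shift14(reg0[i] * cnst1 + reg1[i] * cnst0);
  }
}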
+
+#define VP9_DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) ({ \
+ v8i16 dst_m; \
+ v4i32 tp0_m, tp1_m; \
+ \
+ DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \
+ SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \
+ dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \
+ \
+ dst_m; \
+})
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, \
+ c0, c0, c1, c1, madd0_m, madd1_m, madd2_m, madd3_m); \
+ SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, \
+ out0, out1, out2, out3) { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \
+ \
+ ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \
+ ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
+ cst0, cst0, cst2, cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
+ m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, \
+ cst1, cst1, cst3, cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, \
+ m4_m, m5_m, tmp3_m, tmp2_m); \
+ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \
+ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+}
+
+#define VP9_LD_HADD(psrc, stride) ({ \
+ v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \
+ v4i32 vec_w_m; \
+ \
+ LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \
+ ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \
+ LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \
+ ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, \
+ in4_m, in6_m, in0_m, in4_m); \
+ in0_m += in4_m; \
+ \
+ vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \
+ HADD_SW_S32(vec_w_m); \
+})
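
Stripped of the vector loads, this macro reduces an 8x8 block of int16 residuals to a single 32-bit sum; a scalar sketch (illustrative only):

#include <stdint.h>

/* Scalar sketch of VP9_LD_HADD: sum of the 8x8 int16 block at (psrc, stride). */
static int32_t ld_hadd_sketch(const int16_t *psrc, int32_t stride) {
  int32_t r, c, sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += psrc[r * stride + c];
  return sum;
}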
+
+#define VP9_FDCT_POSTPROC_2V_NEG_H(vec0, vec1) { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one_m = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clti_s_h(vec0, 0); \
+ tp1_m = __msa_clti_s_h(vec1, 0); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one_m & tp0_m; \
+ tp1_m = one_m & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+}
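
Per element, this post-processing step matches the half-round used by the C fdct32 reference: add 1, add another 1 for negative inputs (the sign test is taken before the increment), then shift right by 2. A scalar sketch:

#include <stdint.h>

/* Scalar sketch of VP9_FDCT_POSTPROC_2V_NEG_H applied to one element. */
static int16_t fdct32_postproc_neg_sketch(int16_t v) {
  return (int16_t)((v + 1 + (v < 0)) >> 2);
}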
+
+#define VP9_FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) { \
+ v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v8i16 x0_m, x1_m, x2_m, x3_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \
+ cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \
+ \
+ /* FDCT stage1 */ \
+ BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x1_m, x0_m); \
+ out4 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \
+ x2_m = -x2_m; \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out6 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ out0 = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ x2_m = __msa_splati_h(coeff_m, 2); \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \
+ \
+ /* stage2 */ \
+ ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \
+ \
+ s6_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \
+ s5_m = VP9_DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \
+ \
+ /* stage3 */ \
+ BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \
+ x1_m = __msa_ilvev_h(x0_m, x1_m); \
+ out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \
+ x2_m = __msa_ilvev_h(x3_m, x2_m); \
+ out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+ \
+ x1_m = __msa_splati_h(coeff_m, 5); \
+ x0_m = -x0_m; \
+ x0_m = __msa_ilvev_h(x1_m, x0_m); \
+ out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \
+ \
+ x2_m = __msa_splati_h(coeff_m, 6); \
+ x3_m = -x3_m; \
+ x2_m = __msa_ilvev_h(x2_m, x3_m); \
+ out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \
+}
+
+#define VP9_FDCT8x16_ODD(input0, input1, input2, input3, \
+ input4, input5, input6, input7, \
+ out1, out3, out5, out7, \
+ out9, out11, out13, out15) { \
+ v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \
+ v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \
+ v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \
+ v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \
+ v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \
+ v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, \
+ cospi_24_64, -cospi_8_64, -cospi_24_64, \
+ cospi_12_64, cospi_20_64 }; \
+ v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, \
+ cospi_18_64, cospi_10_64, cospi_22_64, \
+ cospi_6_64, cospi_26_64 }; \
+ v8i16 coeff2_m = { -cospi_2_64, -cospi_10_64, -cospi_18_64, \
+ -cospi_26_64, 0, 0, 0, 0 }; \
+ \
+ /* stp 1 */ \
+ ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \
+ ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \
+ \
+ cnst4_m = __msa_splati_h(coeff_m, 0); \
+ stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \
+ \
+ cnst5_m = __msa_splati_h(coeff_m, 1); \
+ cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \
+ stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \
+ stp24_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \
+ stp23_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \
+ \
+ /* stp2 */ \
+ BUTTERFLY_4(input0, input1, stp22_m, stp23_m, \
+ stp30_m, stp31_m, stp32_m, stp33_m); \
+ BUTTERFLY_4(input7, input6, stp25_m, stp24_m, \
+ stp37_m, stp36_m, stp35_m, stp34_m); \
+ \
+ ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \
+ ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp26_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 4); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp21_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ stp25_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff_m, 3); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ stp22_m = VP9_DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \
+ \
+ /* stp4 */ \
+ BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, \
+ vec6_m, vec2_m, vec4_m, vec5_m); \
+ BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, \
+ stp21_m, stp23_m, stp24_m, stp31_m); \
+ \
+ ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ \
+ out1 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 0); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out15 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out9 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 2); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out7 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ cnst0_m = __msa_splati_h(coeff2_m, 1); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ out11 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \
+ SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ out13 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ \
+ cnst1_m = __msa_splati_h(coeff2_m, 3); \
+ cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \
+ out3 = VP9_DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+}
+
+#define VP9_FDCT32_POSTPROC_NEG_W(vec) { \
+ v4i32 temp_m; \
+ v4i32 one_m = __msa_ldi_w(1); \
+ \
+ temp_m = __msa_clti_s_w(vec, 0); \
+ vec += 1; \
+ temp_m = one_m & temp_m; \
+ vec += temp_m; \
+ vec >>= 2; \
+}
+
+#define VP9_FDCT32_POSTPROC_2V_POS_H(vec0, vec1) { \
+ v8i16 tp0_m, tp1_m; \
+ v8i16 one = __msa_ldi_h(1); \
+ \
+ tp0_m = __msa_clei_s_h(vec0, 0); \
+ tp1_m = __msa_clei_s_h(vec1, 0); \
+ tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \
+ tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \
+ vec0 += 1; \
+ vec1 += 1; \
+ tp0_m = one & tp0_m; \
+ tp1_m = one & tp1_m; \
+ vec0 += tp0_m; \
+ vec1 += tp1_m; \
+ vec0 >>= 2; \
+ vec1 >>= 2; \
+}
+
+#define VP9_DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, \
+ reg1_right, const0, const1, \
+ out0, out1, out2, out3) { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \
+ v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \
+ v4i32 k0_m = __msa_fill_w((int32_t) const0); \
+ \
+ s0_m = __msa_fill_w((int32_t) const1); \
+ k0_m = __msa_ilvev_w(s0_m, k0_m); \
+ \
+ ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \
+ ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \
+ ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \
+ ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \
+ \
+ DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+ \
+ DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \
+ DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \
+ tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \
+ tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \
+ tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \
+ tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \
+ out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \
+ out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \
+}
+#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index df70d48..78dced2 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -529,3 +529,10 @@
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
return cr->rdmult;
}
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+ cr->sb_index = 0;
+}
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 99bb98e..29d2a91 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -75,6 +75,8 @@
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
return segment_id == CR_SEGMENT_ID_BOOST1 ||
segment_id == CR_SEGMENT_ID_BOOST2;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index b9987c1..223c923 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -29,6 +29,8 @@
return (sum + 8) >> 4;
}
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+// second pass, 12 bit, dynamic range [-2040, 2040]
static void hadamard_col8(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
@@ -65,15 +67,18 @@
int16_t buffer[64];
int16_t *tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
- hadamard_col8(src_diff, src_stride, tmp_buf);
+ hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
tmp_buf += 8;
++src_diff;
}
tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
- hadamard_col8(tmp_buf, 8, coeff);
- coeff += 8;
+ hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ coeff += 8; // coeff: 15 bit
+ // dynamic range [-16320, 16320]
++tmp_buf;
}
}
@@ -83,37 +88,42 @@
int16_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ (idx & 0x01) * 8;
vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
}
+ // coeff: 15 bit, dynamic range [-16320, 16320]
for (idx = 0; idx < 64; ++idx) {
int16_t a0 = coeff[0];
int16_t a1 = coeff[64];
int16_t a2 = coeff[128];
int16_t a3 = coeff[192];
- int16_t b0 = a0 + a1;
- int16_t b1 = a0 - a1;
- int16_t b2 = a2 + a3;
- int16_t b3 = a2 - a3;
+ int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ int16_t b3 = (a2 - a3) >> 1;
- coeff[0] = (b0 + b2) >> 1;
- coeff[64] = (b1 + b3) >> 1;
- coeff[128] = (b0 - b2) >> 1;
- coeff[192] = (b1 - b3) >> 1;
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
++coeff;
}
}
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
int16_t vp9_satd_c(const int16_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i)
satd += abs(coeff[i]);
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
return (int16_t)satd;
}
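
The reordered shifts in the 16x16 second pass above exist to keep the intermediate sums inside int16_t; the bound below is worked out from the dynamic-range comments in this hunk:

/* Worked bound for the 16x16 second pass:
 *   |a0..a3| <= 16320                      (15-bit Hadamard outputs)
 *   before:  b0 = a0 + a1                  |b0| <= 32640  (still fits int16_t)
 *            b0 + b2, then >> 1            up to 65280    (overflows int16_t)
 *   after:   b0 = (a0 + a1) >> 1           |b0| <= 16320
 *            coeff = b0 + b2               up to 32640    (fits int16_t)
 */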
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 092d265..1ebdd06 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -403,7 +403,7 @@
int hbs, int mi_row, int mi_col,
PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const vp9_prob *const probs = xd->partition_probs[ctx];
const int has_rows = (mi_row + hbs) < cm->mi_rows;
const int has_cols = (mi_col + hbs) < cm->mi_cols;
@@ -481,9 +481,12 @@
static void write_modes(VP9_COMP *cpi,
const TileInfo *const tile, vp9_writer *w,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+ const VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int mi_row, mi_col;
+ set_partition_probs(cm, xd);
+
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
vp9_zero(xd->left_seg_context);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ddfe69f..dcddefc 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1564,7 +1564,7 @@
}
}
-const struct {
+static const struct {
int row;
int col;
} coord_lookup[16] = {
@@ -2220,66 +2220,6 @@
*max_block_size = max_size;
}
-static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
- MACROBLOCKD *const xd,
- int mi_row, int mi_col,
- BLOCK_SIZE *min_block_size,
- BLOCK_SIZE *max_block_size) {
- VP9_COMMON *const cm = &cpi->common;
- MODE_INFO **mi_8x8 = xd->mi;
- const int left_in_image = xd->left_available && mi_8x8[-1];
- const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride];
- int row8x8_remaining = tile->mi_row_end - mi_row;
- int col8x8_remaining = tile->mi_col_end - mi_col;
- int bh, bw;
- BLOCK_SIZE min_size = BLOCK_32X32;
- BLOCK_SIZE max_size = BLOCK_8X8;
- int bsl = mi_width_log2_lookup[BLOCK_64X64];
- const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm->current_video_frame)) & 0x1;
- // Trap case where we do not have a prediction.
- if (search_range_ctrl &&
- (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
- int block;
- MODE_INFO **mi;
- BLOCK_SIZE sb_type;
-
- // Find the min and max partition sizes used in the left SB64.
- if (left_in_image) {
- MODE_INFO *cur_mi;
- mi = &mi_8x8[-1];
- for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- cur_mi = mi[block * xd->mi_stride];
- sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
- min_size = MIN(min_size, sb_type);
- max_size = MAX(max_size, sb_type);
- }
- }
- // Find the min and max partition sizes used in the above SB64.
- if (above_in_image) {
- mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
- for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
- min_size = MIN(min_size, sb_type);
- max_size = MAX(max_size, sb_type);
- }
- }
-
- min_size = min_partition_size[min_size];
- max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
- &bh, &bw);
- min_size = MIN(min_size, max_size);
- min_size = MAX(min_size, BLOCK_8X8);
- max_size = MIN(max_size, BLOCK_32X32);
- } else {
- min_size = BLOCK_8X8;
- max_size = BLOCK_32X32;
- }
-
- *min_block_size = min_size;
- *max_block_size = max_size;
-}
-
// TODO(jingning) refactor functions setting partition search range
static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -3786,9 +3726,13 @@
TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
int tile_tok = 0;
- if (cpi->tile_data == NULL) {
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+ if (cpi->tile_data != NULL)
+ vpx_free(cpi->tile_data);
CHECK_MEM_ERROR(cm, cpi->tile_data,
vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
TileDataEnc *tile_data =
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index ba38d64..d708b83 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1596,6 +1596,9 @@
sizeof(*cm->frame_contexts)));
cpi->use_svc = 0;
+ cpi->resize_state = 0;
+ cpi->resize_avg_qp = 0;
+ cpi->resize_buffer_underflow = 0;
cpi->common.buffer_pool = pool;
init_config(cpi, oxcf);
@@ -3032,6 +3035,31 @@
oxcf->scaled_frame_height);
}
+ if (oxcf->pass == 0 &&
+ oxcf->rc_mode == VPX_CBR &&
+ !cpi->use_svc &&
+ oxcf->resize_mode == RESIZE_DYNAMIC) {
+ if (cpi->resize_state == 1) {
+ oxcf->scaled_frame_width =
+ (cm->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+ oxcf->scaled_frame_height =
+ (cm->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+ } else if (cpi->resize_state == -1) {
+ // Go back up to original size.
+ oxcf->scaled_frame_width = oxcf->width;
+ oxcf->scaled_frame_height = oxcf->height;
+ }
+ if (cpi->resize_state != 0) {
+ // There has been a change in frame size.
+ vp9_set_size_literal(cpi,
+ oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+ }
+ }
+
if ((oxcf->pass == 2) &&
(!cpi->use_svc ||
(is_two_pass_svc(cpi) &&
@@ -3962,7 +3990,6 @@
extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
const unsigned char *img2, int img2_pitch,
int width, int height);
-#endif
static void adjust_image_stat(double y, double u, double v, double all,
ImageStat *s) {
@@ -3972,6 +3999,7 @@
s->stat[ALL] += all;
s->worst = MIN(s->worst, all);
}
+#endif // CONFIG_INTERNAL_STATS
int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
size_t *size, uint8_t *dest,
@@ -4165,7 +4193,7 @@
(is_two_pass_svc(cpi) &&
cpi->svc.encode_empty_frame_state != ENCODING))) {
vp9_rc_get_second_pass_params(cpi);
- } else {
+ } else if (oxcf->pass == 1) {
set_frame_size(cpi);
}
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 6ce4a67..2b0da10 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -306,6 +306,7 @@
YV12_BUFFER_CONFIG scaled_last_source;
TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
// For a still frame, this flag is set to 1 to skip partition search.
int partition_search_skippable_frame;
@@ -478,6 +479,12 @@
#endif
int resize_pending;
+ int resize_state;
+ int resize_scale_num;
+ int resize_scale_den;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
// VAR_BASED_PARTITION thresholds
// 0 - threshold_64x64; 1 - threshold_32x32;
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index 8700ccd..4ae3fbc 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -54,6 +54,18 @@
return 0;
}
+static int get_max_tile_cols(VP9_COMP *cpi) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
+ int mi_cols = aligned_width >> MI_SIZE_LOG2;
+ int min_log2_tile_cols, max_log2_tile_cols;
+ int log2_tile_cols;
+
+ vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+ log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+ min_log2_tile_cols, max_log2_tile_cols);
+ return (1 << log2_tile_cols);
+}
+
void vp9_encode_tiles_mt(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
@@ -65,20 +77,30 @@
// Only run once to create threads and allocate thread data.
if (cpi->num_workers == 0) {
+ int allocated_workers = num_workers;
+
+ // When SVC is used, allocate threads according to the highest resolution,
+ // since the worker count is capped by the number of tile columns.
+ if (cpi->use_svc) {
+ int max_tile_cols = get_max_tile_cols(cpi);
+ allocated_workers = MIN(cpi->oxcf.max_threads, max_tile_cols);
+ }
+
CHECK_MEM_ERROR(cm, cpi->workers,
- vpx_malloc(num_workers * sizeof(*cpi->workers)));
+ vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
- vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+ vpx_calloc(allocated_workers,
+ sizeof(*cpi->tile_thr_data)));
- for (i = 0; i < num_workers; i++) {
+ for (i = 0; i < allocated_workers; i++) {
VP9Worker *const worker = &cpi->workers[i];
EncWorkerData *thread_data = &cpi->tile_thr_data[i];
++cpi->num_workers;
winterface->init(worker);
- if (i < num_workers - 1) {
+ if (i < allocated_workers - 1) {
thread_data->cpi = cpi;
// Allocate thread data.
@@ -154,7 +176,7 @@
// Set the starting tile for each thread.
thread_data->start = i;
- if (i == num_workers - 1)
+ if (i == cpi->num_workers - 1)
winterface->execute(worker);
else
winterface->launch(worker);
@@ -171,7 +193,7 @@
EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
// Accumulate counters.
- if (i < num_workers - 1) {
+ if (i < cpi->num_workers - 1) {
vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
accumulate_rd_opt(&cpi->td, thread_data->td);
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 2342726..081b99f 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -286,20 +286,20 @@
bestmv->row *= 8; \
bestmv->col *= 8;
-static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
- const MV *bestmv,
- const MV *ref_mv,
- int error_per_bit,
- const vp9_variance_fn_ptr_t *vfp,
- const uint8_t *const src,
- const int src_stride,
- const uint8_t *const y,
- int y_stride,
- const uint8_t *second_pred,
- int w, int h, int offset,
- int *mvjcost, int *mvcost[2],
- unsigned int *sse1,
- int *distortion) {
+static unsigned int setup_center_error(const MACROBLOCKD *xd,
+ const MV *bestmv,
+ const MV *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src,
+ const int src_stride,
+ const uint8_t *const y,
+ int y_stride,
+ const uint8_t *second_pred,
+ int w, int h, int offset,
+ int *mvjcost, int *mvcost[2],
+ unsigned int *sse1,
+ int *distortion) {
unsigned int besterr;
#if CONFIG_VP9_HIGHBITDEPTH
if (second_pred != NULL) {
@@ -610,7 +610,7 @@
return besterr;
}
-const MV search_step_table[12] = {
+static const MV search_step_table[12] = {
// left, right, up, down
{0, -4}, {0, 4}, {-4, 0}, {4, 0},
{0, -2}, {0, 2}, {-2, 0}, {2, 0},
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 2479b6e..3eaa990 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -658,7 +658,8 @@
block = 0;
*rate = 0;
*dist = 0;
- *sse = (*sse << 6) >> shift;
+ if (*sse < INT64_MAX)
+ *sse = (*sse << 6) >> shift;
for (r = 0; r < max_blocks_high; r += block_step) {
for (c = 0; c < num_4x4_w; c += block_step) {
if (c < max_blocks_wide) {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 32682fe..158581b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1596,6 +1596,7 @@
target = calc_pframe_target_size_one_pass_cbr(cpi);
vp9_rc_set_frame_target(cpi, target);
+ cpi->resize_state = vp9_resize_one_pass_cbr(cpi);
}
int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1756,3 +1757,92 @@
vbr_rate_correction(cpi, &target_rate);
vp9_rc_set_frame_target(cpi, target_rate);
}
+
+// Check if we should resize, based on the average QP over a recent window of
+// frames. For now, allow at most one scale-down step; the scaling factor is 2
+// in each dimension.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int resize_now = 0;
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 1;
+ // Don't resize on key frame; reset the counters on key frame.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ return 0;
+ }
+ // Resize based on average QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (cpi->rc.frames_since_key > 2 * cpi->framerate) {
+ const int window = 5 * cpi->framerate;
+ cpi->resize_avg_qp += cm->base_qindex;
+ if (cpi->rc.buffer_level < 0)
+ ++cpi->resize_buffer_underflow;
+ ++cpi->resize_count;
+ // Check for resize action every "window" frames.
+ if (cpi->resize_count == window) {
+ int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+ // Resize down if the buffer level has underflowed by a sufficient amount
+ // in the past window, and we are at the original resolution.
+ // Resize back up if average QP is low, and we are currently in a resized
+ // down state.
+ if (cpi->resize_state == 0 &&
+ cpi->resize_buffer_underflow > (cpi->resize_count >> 3)) {
+ resize_now = 1;
+ } else if (cpi->resize_state == 1 &&
+ avg_qp < 40 * cpi->rc.worst_quality / 100) {
+ resize_now = -1;
+ }
+ // Reset for next window measurement.
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ cpi->resize_buffer_underflow = 0;
+ }
+ }
+ // If the decision is to resize, reset some quantities and check whether we
+ // should reduce the rate correction factor.
+ if (resize_now != 0) {
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ int tot_scale_change;
+ // For now, resize is by 1/2 x 1/2.
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 2;
+ tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+ (cpi->resize_scale_num * cpi->resize_scale_num);
+ // Reset buffer level to optimal, update target size.
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->this_frame_target = calc_pframe_target_size_one_pass_cbr(cpi);
+ // Reset cyclic refresh parameters.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+ vp9_cyclic_refresh_reset_resize(cpi);
+ // Get the projected qindex, based on the scaled target frame size (scaled
+ // so that target_bits_per_mb in vp9_rc_regulate_q is the correct target).
+ target_bits_per_frame = (resize_now == 1) ?
+ rc->this_frame_target * tot_scale_change :
+ rc->this_frame_target / tot_scale_change;
+ active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ qindex = vp9_rc_regulate_q(cpi,
+ target_bits_per_frame,
+ rc->best_quality,
+ active_worst_quality);
+ // If resizing down, check if the projected q index is close to
+ // worst_quality, and if so, reduce the rate correction factor (since we can
+ // likely afford a lower q for the resized frame).
+ if (resize_now == 1 &&
+ qindex > 90 * cpi->rc.worst_quality / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+ }
+ // If resizing back up, check if the projected q index is too far above the
+ // current base_qindex, and if so, reduce the rate correction factor (since
+ // we prefer to keep q for the resized frame close to the previous q).
+ if (resize_now == -1 &&
+ qindex > 130 * cm->base_qindex / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+ }
+ }
+ return resize_now;
+}
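
To make the cadence of this check concrete, a worked example at 30 fps (illustrative numbers; the thresholds are the ones used in the function above):

/* Resize decision cadence at 30 fps (sketch):
 *   sampling starts once frames_since_key > 2 * 30 = 60 frames  (~2 s)
 *   decision window                       = 5 * 30 = 150 frames (~5 s)
 *   scale down: buffer underflow on more than 150 >> 3 = 18 of those frames,
 *               while at the original resolution
 *   scale up:   average QP below 40 * worst_quality / 100, while scaled down
 * so the earliest possible resize lands roughly 7 seconds after a key frame. */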
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index e12d200..a10836c 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -245,6 +245,8 @@
void vp9_set_target_rate(struct VP9_COMP *cpi);
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index bbcbfe9..90ee1e4 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -265,6 +265,7 @@
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
RD_OPT *const rd = &cpi->rd;
int i;
@@ -280,6 +281,7 @@
cm->frame_type != KEY_FRAME) ? 0 : 1;
set_block_thresholds(cm, rd);
+ set_partition_probs(cm, xd);
if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
fill_token_costs(x->token_costs, cm->fc->coef_probs);
@@ -287,7 +289,7 @@
if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
cm->frame_type == KEY_FRAME) {
for (i = 0; i < PARTITION_CONTEXTS; ++i)
- vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
+ vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
vp9_partition_tree);
}
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index bca5b13..f46cad8 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -29,7 +29,7 @@
typedef int16_t interp_kernel[INTERP_TAPS];
// Filters for interpolation (0.5-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
{-3, 0, 35, 64, 35, 0, -3, 0},
{-3, -1, 34, 64, 36, 1, -3, 0},
{-3, -1, 32, 64, 38, 1, -3, 0},
@@ -65,7 +65,7 @@
};
// Filters for interpolation (0.625-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
{-1, -8, 33, 80, 33, -8, -1, 0},
{-1, -8, 30, 80, 35, -8, -1, 1},
{-1, -8, 28, 80, 37, -7, -2, 1},
@@ -101,7 +101,7 @@
};
// Filters for interpolation (0.75-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
{2, -11, 25, 96, 25, -11, 2, 0},
{2, -11, 22, 96, 28, -11, 2, 0},
{2, -10, 19, 95, 31, -11, 2, 0},
@@ -137,7 +137,7 @@
};
// Filters for interpolation (0.875-band) - note this also filters integer pels.
-const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
{3, -8, 13, 112, 13, -8, 3, 0},
{3, -7, 10, 112, 17, -9, 3, -1},
{2, -6, 7, 111, 21, -9, 3, -1},
@@ -173,7 +173,7 @@
};
// Filters for interpolation (full-band) - no filtering for integer pixels
-const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
{0, 0, 0, 128, 0, 0, 0, 0},
{0, 1, -3, 128, 3, -1, 0, 0},
{-1, 2, -6, 127, 7, -2, 1, 0},
@@ -215,15 +215,15 @@
static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
int outlength16 = outlength * 16;
if (outlength16 >= inlength * 16)
- return vp9_filteredinterp_filters1000;
+ return filteredinterp_filters1000;
else if (outlength16 >= inlength * 13)
- return vp9_filteredinterp_filters875;
+ return filteredinterp_filters875;
else if (outlength16 >= inlength * 11)
- return vp9_filteredinterp_filters750;
+ return filteredinterp_filters750;
else if (outlength16 >= inlength * 9)
- return vp9_filteredinterp_filters625;
+ return filteredinterp_filters625;
else
- return vp9_filteredinterp_filters500;
+ return filteredinterp_filters500;
}
static void interpolate(const uint8_t *const input, int inlength,
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index cb1b0df..1b35ac9 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -152,7 +152,6 @@
}
} else {
int layer_end;
- float bitrate_alloc = 1.0;
if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
layer_end = svc->number_temporal_layers;
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 56a91ed..4531d79 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -264,17 +264,18 @@
__m128i b2 = _mm_add_epi16(coeff2, coeff3);
__m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
coeff0 = _mm_add_epi16(b0, b2);
coeff1 = _mm_add_epi16(b1, b3);
- coeff0 = _mm_srai_epi16(coeff0, 1);
- coeff1 = _mm_srai_epi16(coeff1, 1);
_mm_store_si128((__m128i *)coeff, coeff0);
_mm_store_si128((__m128i *)(coeff + 64), coeff1);
coeff2 = _mm_sub_epi16(b0, b2);
coeff3 = _mm_sub_epi16(b1, b3);
- coeff2 = _mm_srai_epi16(coeff2, 1);
- coeff3 = _mm_srai_epi16(coeff3, 1);
_mm_store_si128((__m128i *)(coeff + 128), coeff2);
_mm_store_si128((__m128i *)(coeff + 192), coeff3);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index b01fdd1..6f091ee 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -152,6 +152,10 @@
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h
+ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
+endif
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
ifeq ($(ARCH_X86_64), yes)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 5415215..6074da2 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,4 +152,8 @@
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
+VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
+
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))