Merge changes Iedb5b6a3,Iaea98508,I36580cea,Ia0574320 * changes: vp9_decodeframe.h: remove unused prototype vp9_decodeframe: move public funcs to end of file vp9_decodeframe: reorder some functions vp9_decodeframe: hide vp9_dec_build_inter_predictors_sb
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index cd562da..d2687b2 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc
@@ -16,6 +16,7 @@ #include "./vp9_rtcd.h" #include "test/acm_random.h" +#include "test/clear_system_state.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -66,6 +67,7 @@ for (int num_tests = 0; num_tests < kNumTests; ++num_tests) { pred_funcs[k](src, kBPS, above, left); } + libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); @@ -211,6 +213,14 @@ NULL, NULL, NULL, NULL, vp9_tm_predictor_4x4_neon) #endif // HAVE_NEON +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred4, vp9_dc_predictor_4x4_msa, + vp9_dc_left_predictor_4x4_msa, vp9_dc_top_predictor_4x4_msa, + vp9_dc_128_predictor_4x4_msa, vp9_v_predictor_4x4_msa, + vp9_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, + NULL, vp9_tm_predictor_4x4_msa) +#endif // HAVE_MSA + // ----------------------------------------------------------------------------- // 8x8 @@ -256,6 +266,14 @@ #endif // HAVE_NEON +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred8, vp9_dc_predictor_8x8_msa, + vp9_dc_left_predictor_8x8_msa, vp9_dc_top_predictor_8x8_msa, + vp9_dc_128_predictor_8x8_msa, vp9_v_predictor_8x8_msa, + vp9_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, + NULL, vp9_tm_predictor_8x8_msa) +#endif // HAVE_MSA + // ----------------------------------------------------------------------------- // 16x16 @@ -299,6 +317,14 @@ vp9_tm_predictor_16x16_neon) #endif // HAVE_NEON +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred16, vp9_dc_predictor_16x16_msa, + vp9_dc_left_predictor_16x16_msa, vp9_dc_top_predictor_16x16_msa, + vp9_dc_128_predictor_16x16_msa, vp9_v_predictor_16x16_msa, + vp9_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, + NULL, vp9_tm_predictor_16x16_msa) +#endif // HAVE_MSA + // ----------------------------------------------------------------------------- // 32x32 @@ -340,4 +366,12 @@ NULL, NULL, NULL, NULL, NULL, vp9_tm_predictor_32x32_neon) #endif // HAVE_NEON +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TestIntraPred32, vp9_dc_predictor_32x32_msa, + vp9_dc_left_predictor_32x32_msa, vp9_dc_top_predictor_32x32_msa, + vp9_dc_128_predictor_32x32_msa, vp9_v_predictor_32x32_msa, + vp9_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, + NULL, vp9_tm_predictor_32x32_msa) +#endif // HAVE_MSA + #include "test/test_libvpx.cc"
diff --git a/test/variance_test.cc b/test/variance_test.cc index 2d17119..670fe09 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc
@@ -1106,12 +1106,12 @@ #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest; TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); } TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest; @@ -1160,7 +1160,7 @@ make_tuple(6, 5, subpel_variance64x32_c, 0), make_tuple(6, 6, subpel_variance64x64_c, 0))); -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER const SubpixVarMxNFunc vp8_subpel_variance16x16_c = vp8_sub_pixel_variance16x16_c; const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c; @@ -1174,7 +1174,7 @@ make_tuple(3, 4, vp8_subpel_variance8x16_c, 0), make_tuple(4, 3, vp8_subpel_variance16x8_c, 0), make_tuple(4, 4, vp8_subpel_variance16x16_c, 0))); -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = vp9_sub_pixel_avg_variance4x4_c; @@ -1460,7 +1460,7 @@ #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9_ENCODER -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER #if HAVE_MMX const SubpixVarMxNFunc subpel_variance16x16_mmx = vp8_sub_pixel_variance16x16_mmx; @@ -1476,7 +1476,7 @@ make_tuple(3, 3, subpel_variance8x8_mmx, 0), make_tuple(2, 2, subpel_variance4x4_mmx, 0))); #endif // HAVE_MMX -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER #if HAVE_SSE2 @@ -1768,7 +1768,7 @@ #endif // HAVE_SSE2 #endif // CONFIG_VP9_ENCODER -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER #if HAVE_SSE2 const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 = vp8_sub_pixel_variance16x16_wmt; @@ -1788,7 +1788,7 @@ make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0), make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0))); #endif // HAVE_SSE2 -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER #if HAVE_SSSE3 @@ -1879,7 +1879,7 @@ #endif // HAVE_SSSE3 #endif // CONFIG_VP9_ENCODER -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER #if HAVE_SSSE3 const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 = vp8_sub_pixel_variance16x16_ssse3; @@ -1890,7 +1890,7 @@ ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0), make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0))); #endif // HAVE_SSSE3 -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if HAVE_AVX2 const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2; @@ -1931,7 +1931,7 @@ #endif // CONFIG_VP9_ENCODER #endif // HAVE_AVX2 -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER #if HAVE_MEDIA const SubpixVarMxNFunc subpel_variance16x16_media = vp8_sub_pixel_variance16x16_armv6; @@ -1942,7 +1942,7 @@ ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0), make_tuple(4, 4, subpel_variance16x16_media, 0))); #endif // HAVE_MEDIA -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if HAVE_NEON const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon; @@ -1972,7 +1972,7 @@ make_tuple(3, 4, variance8x16_neon, 0), make_tuple(3, 3, variance8x8_neon, 0))); -#if CONFIG_VP8 +#if CONFIG_VP8_ENCODER #if HAVE_NEON_ASM const SubpixVarMxNFunc vp8_subpel_variance16x16_neon = vp8_sub_pixel_variance16x16_neon; @@ -1980,7 +1980,7 @@ NEON, VP8SubpelVarianceTest, ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0))); #endif // HAVE_NEON_ASM -#endif // CONFIG_VP8 +#endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c new file mode 100644 index 0000000..2fc6105 --- /dev/null +++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c
@@ -0,0 +1,737 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ +} + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, + uint8_t *dst, int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void vp9_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void vp9_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void vp9_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void vp9_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void vp9_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void vp9_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void vp9_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void vp9_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void vp9_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void vp9_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void vp9_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void vp9_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void vp9_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void vp9_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void vp9_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void vp9_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void vp9_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void vp9_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +}
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h index 3751e35..2043e13 100644 --- a/vp9/common/mips/msa/vp9_macros_msa.h +++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -244,6 +244,22 @@ out3 = LW((psrc) + 3 * stride); \ } +/* Description : Load double words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Loads double word in 'out0' from (psrc) + Loads double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Stores word from 'in0' to (pdst) @@ -482,6 +498,24 @@ SD(out0_m, pdst); \ } +/* Description : Store as 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from input vector 'in' is copied + and stored to destination memory at (pdst) + Index 1 double word element from input vector 'in' is copied + and stored to destination memory at (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + /* Description : Store as 8x4 byte block to destination memory from input vectors Arguments : Inputs - in0, in1, pdst, stride @@ -743,6 +777,26 @@ CLIP_SH2_0_255(in2, in3); \ } +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is stored in 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ +} +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ +} +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + /* Description : Insert specified word elements from input vectors to 1 destination vector Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) @@ -755,6 +809,19 @@ } #define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) +/* Description : Insert specified double word elements from input vectors to 1 + destination vector + Arguments : Inputs - in0, in1 (2 input vectors) + Outputs - out (output vector) + Return Type - as per RTYPE +*/ +#define INSERT_D2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ +} +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1
diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c new file mode 100644 index 0000000..64cb9a8 --- /dev/null +++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -0,0 +1,137 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) { + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src0); + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst0, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); + dst_ptr += (2 * dst_stride); + } +} + +static void filter_by_weight16x16_msa(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; + int32_t row; + v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) { + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); + dst_ptr += dst_stride; + } +} + +void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight); +} + +void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight); +}
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index afcdf22..b256d4a 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c
@@ -12,7 +12,8 @@ #include "vp9/common/vp9_filter.h" -const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { +DECLARE_ALIGNED(256, const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 },
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 484e457..9816728 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c
@@ -267,8 +267,8 @@ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { int lvl_seg = default_filt_lvl; - if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { - const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); + if (segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = get_segdata(seg, seg_id, SEG_LVL_ALT_LF); lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? data : default_filt_lvl + data, 0, MAX_LOOP_FILTER);
diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 564a3eb..d83f3c1 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c
@@ -266,8 +266,8 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { - const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); + if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? data : base_qindex + data; return clamp(seg_qindex, 0, MAXQ);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index bbe200d..2f262a6 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl
@@ -66,7 +66,7 @@ specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_4x4/; @@ -78,22 +78,22 @@ specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_4x4 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc"; +specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_4x4 dspr2 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_4x4 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_4x4 msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_4x4 msa/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; @@ -105,7 +105,7 @@ specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_8x8/; @@ -117,22 +117,22 @@ specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_v_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_8x8 dspr2 neon/, "$sse_x86inc"; +specialize qw/vp9_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_dc_top_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_dc_left_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_8x8 neon/, "$sse_x86inc"; +specialize qw/vp9_dc_128_predictor_8x8 neon msa/, "$sse_x86inc"; add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; @@ -144,7 +144,7 @@ specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_16x16/; @@ -156,22 +156,22 @@ specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_tm_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_16x16 dspr2 neon/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_16x16 neon/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; @@ -183,7 +183,7 @@ specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc"; +specialize qw/vp9_h_predictor_32x32 neon msa/, "$ssse3_x86inc"; add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vp9_d117_predictor_32x32/; @@ -195,22 +195,22 @@ specialize qw/vp9_d153_predictor_32x32/; add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc"; +specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc"; add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64"; +specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64"; add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_predictor_32x32 msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_top_predictor_32x32 msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_left_predictor_32x32 msa/, "$sse2_x86inc"; add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc"; +specialize qw/vp9_dc_128_predictor_32x32 msa/, "$sse2_x86inc"; # # Loopfilter @@ -276,10 +276,10 @@ $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight16x16 sse2/; +specialize qw/vp9_filter_by_weight16x16 sse2 msa/; add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight8x8 sse2/; +specialize qw/vp9_filter_by_weight8x8 sse2 msa/; } #
diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 910200e..471e238 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c
@@ -25,12 +25,6 @@ // the coding mechanism is still subject to change so these provide a // convenient single point of change. -int vp9_segfeature_active(const struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - return seg->enabled && - (seg->feature_mask[segment_id] & (1 << feature_id)); -} - void vp9_clearall_segfeatures(struct segmentation *seg) { vp9_zero(seg->feature_data); vp9_zero(seg->feature_mask); @@ -60,12 +54,6 @@ seg->feature_data[segment_id][feature_id] = seg_data; } -int vp9_get_segdata(const struct segmentation *seg, int segment_id, - SEG_LVL_FEATURES feature_id) { - return seg->feature_data[segment_id][feature_id]; -} - - const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)] = { 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7
diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h index ff2d66a..95c9918 100644 --- a/vp9/common/vp9_seg_common.h +++ b/vp9/common/vp9_seg_common.h
@@ -49,9 +49,12 @@ unsigned int feature_mask[MAX_SEGMENTS]; }; -int vp9_segfeature_active(const struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); +static INLINE int segfeature_active(const struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && + (seg->feature_mask[segment_id] & (1 << feature_id)); +} void vp9_clearall_segfeatures(struct segmentation *seg); @@ -68,9 +71,10 @@ SEG_LVL_FEATURES feature_id, int seg_data); -int vp9_get_segdata(const struct segmentation *seg, - int segment_id, - SEG_LVL_FEATURES feature_id); +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index bc03caf..d34926d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c
@@ -177,7 +177,7 @@ static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, int segment_id, vp9_reader *r) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int ctx = vp9_get_skip_context(xd); @@ -307,9 +307,9 @@ FRAME_CONTEXT *const fc = cm->fc; FRAME_COUNTS *counts = xd->counts; - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id, - SEG_LVL_REF_FRAME); + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); @@ -444,9 +444,8 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, int segment_id, vp9_reader *r) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != - INTRA_FRAME; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { const int ctx = vp9_get_intra_inter_context(xd); const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]); @@ -493,7 +492,7 @@ mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx); } - if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 71c1e0b..df70d48 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -95,19 +95,6 @@ return 1; } -static void adjust_cyclic_refresh_parameters(VP9_COMP *const cpi) { - const VP9_COMMON *const cm = &cpi->common; - const RATE_CONTROL *const rc = &cpi->rc; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - // Adjust some parameters, currently only for low resolutions at low bitrates. - if (cm->width <= 352 && - cm->height <= 288 && - rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 1.25; - } -} - // Check if this coding block, of size bsize, should be considered for refresh // (lower-qp coding). Decision can be based on various factors, such as // size of the coding block (i.e., below min_block size rejected), coding @@ -435,18 +422,30 @@ cr->sb_index = i; } -// Set/update global/frame level cyclic refresh parameters. +// Set cyclic refresh parameters. void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; cr->percent_refresh = 10; + cr->max_qdelta_perc = 50; + cr->time_for_refresh = 0; // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) - // periods of the refresh cycle, after a key frame. This corresponds to ~40 - // frames with cr->percent_refresh = 10. - if (rc->frames_since_key < 40) + // periods of the refresh cycle, after a key frame. + if (rc->frames_since_key < 4 * cr->percent_refresh) cr->rate_ratio_qdelta = 3.0; else cr->rate_ratio_qdelta = 2.0; + // Adjust some parameters for low resolutions at low bitrates. + if (cm->width <= 352 && + cm->height <= 288 && + rc->avg_frame_bandwidth < 3400) { + cr->motion_thresh = 4; + cr->rate_boost_fac = 1.25; + } else { + cr->motion_thresh = 32; + cr->rate_boost_fac = 1.7; + } } // Setup cyclic background refresh: set delta q and segmentation map. @@ -475,9 +474,6 @@ int qindex2; const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); vp9_clear_system_state(); - cr->max_qdelta_perc = 50; - cr->time_for_refresh = 0; - cr->rate_boost_fac = 1.7; // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; @@ -485,9 +481,6 @@ // q will not exceed 457, so (q * q) is within 32bit; see: // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[]. cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; - cr->motion_thresh = 32; - - adjust_cyclic_refresh_parameters(cpi); // Set up segmentation. // Clear down the segment map.
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c index b9987c1..4373566 100644 --- a/vp9/encoder/vp9_avg.c +++ b/vp9/encoder/vp9_avg.c
@@ -29,6 +29,8 @@ return (sum + 8) >> 4; } +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, int src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; @@ -65,15 +67,18 @@ int16_t buffer[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(src_diff, src_stride, tmp_buf); + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] tmp_buf += 8; ++src_diff; } tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); - coeff += 8; + hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + coeff += 8; // coeff: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } } @@ -83,26 +88,28 @@ int16_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); } + // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { int16_t a0 = coeff[0]; int16_t a1 = coeff[64]; int16_t a2 = coeff[128]; int16_t a3 = coeff[192]; - int16_t b0 = a0 + a1; - int16_t b1 = a0 - a1; - int16_t b2 = a2 + a3; - int16_t b3 = a2 - a3; + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; - coeff[0] = (b0 + b2) >> 1; - coeff[64] = (b1 + b3) >> 1; - coeff[128] = (b0 - b2) >> 1; - coeff[192] = (b1 - b3) >> 1; + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; ++coeff; }
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index d20e067..092d265 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c
@@ -93,7 +93,7 @@ static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, int segment_id, const MODE_INFO *mi, vp9_writer *w) { - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip = mi->mbmi.skip; @@ -207,10 +207,10 @@ // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options - if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { assert(!is_compound); assert(mbmi->ref_frame[0] == - vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) @@ -264,7 +264,7 @@ skip = write_skip(cm, xd, segment_id, mi, w); - if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && @@ -293,7 +293,7 @@ write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. - if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(w, mode, inter_probs); } @@ -787,10 +787,10 @@ for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { - const int active = vp9_segfeature_active(seg, i, j); + const int active = segfeature_active(seg, i, j); vp9_wb_write_bit(wb, active); if (active) { - const int data = vp9_get_segdata(seg, i, j); + const int data = get_segdata(seg, i, j); const int data_max = vp9_seg_feature_data_max(j); if (vp9_is_segfeature_signed(j)) {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 8864e0e..f5e3e98 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c
@@ -1051,7 +1051,7 @@ if (!output_enabled) return; - if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) rdc->tx_select_diff[i] += ctx->tx_rd_diff[i]; } @@ -1248,7 +1248,7 @@ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { - if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else @@ -1291,8 +1291,8 @@ if (!frame_is_intra_only(cm)) { FRAME_COUNTS *const counts = td->counts; const int inter_block = is_inter_block(mbmi); - const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_REF_FRAME); + const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_REF_FRAME); if (!seg_ref_active) { counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; // If the segment reference feature is enabled we have only a single @@ -1317,7 +1317,7 @@ } } if (inter_block && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; if (bsize >= BLOCK_8X8) { const PREDICTION_MODE mode = mbmi->mode; @@ -2849,7 +2849,7 @@ const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); } x->source_variance = UINT_MAX; @@ -2909,7 +2909,7 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { const int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) @@ -2984,7 +2984,7 @@ if (cm->frame_type == KEY_FRAME) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); - else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + else if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); else if (bsize >= BLOCK_8X8) vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, @@ -3599,7 +3599,7 @@ const uint8_t *const map = seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col); - seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); if (seg_skip) { partition_search_type = FIXED_PARTITION; } @@ -3786,9 +3786,13 @@ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; int tile_tok = 0; - if (cpi->tile_data == NULL) { + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + if (cpi->tile_data != NULL) + vpx_free(cpi->tile_data); CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); + cpi->allocated_tiles = tile_cols * tile_rows; + for (tile_row = 0; tile_row < tile_rows; ++tile_row) for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *tile_data = @@ -4157,8 +4161,8 @@ MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_SKIP); + const int seg_skip = segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize];
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 6ce4a67..4d2a186 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h
@@ -306,6 +306,7 @@ YV12_BUFFER_CONFIG scaled_last_source; TileDataEnc *tile_data; + int allocated_tiles; // Keep track of memory allocated for tiles. // For a still frame, this flag is set to 1 to skip partition search. int partition_search_skippable_frame;
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 60bff57..2479b6e 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c
@@ -1692,8 +1692,8 @@ // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) continue; mbmi->ref_frame[0] = ref_frame;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index db5460b..e6e17c0 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c
@@ -678,7 +678,7 @@ x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; } - x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; x->errorperbit = rdmult >> 6;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index eacc63f..162d4de 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c
@@ -2120,8 +2120,8 @@ unsigned int *ref_costs_single, unsigned int *ref_costs_comp, vp9_prob *comp_mode_p) { - int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, - SEG_LVL_REF_FRAME); + int seg_ref_active = segfeature_active(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); if (seg_ref_active) { memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); @@ -3007,8 +3007,8 @@ } // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { ref_frame_skip_mask[0] |= (1 << ref_frame); ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; } @@ -3017,7 +3017,7 @@ // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well @@ -3196,7 +3196,7 @@ // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3638,7 +3638,7 @@ rd_cost->rate = INT_MAX; - assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); mbmi->mode = ZEROMV; mbmi->uv_mode = DC_PRED; @@ -3850,7 +3850,7 @@ continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3875,13 +3875,13 @@ // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 3592031..181a99c 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c
@@ -484,7 +484,7 @@ static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, TX_SIZE tx_size) { const int eob_max = 16 << (tx_size << 1); - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; + return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, @@ -615,8 +615,8 @@ MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int ctx = vp9_get_skip_context(xd); - const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, - SEG_LVL_SKIP); + const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); struct tokenize_b_args arg = {cpi, td, t}; if (mbmi->skip) { if (!dry_run)
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 56a91ed..4531d79 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -264,17 +264,18 @@ __m128i b2 = _mm_add_epi16(coeff2, coeff3); __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - coeff0 = _mm_srai_epi16(coeff0, 1); - coeff1 = _mm_srai_epi16(coeff1, 1); _mm_store_si128((__m128i *)coeff, coeff0); _mm_store_si128((__m128i *)(coeff + 64), coeff1); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - coeff2 = _mm_srai_epi16(coeff2, 1); - coeff3 = _mm_srai_epi16(coeff3, 1); _mm_store_si128((__m128i *)(coeff + 128), coeff2); _mm_store_si128((__m128i *)(coeff + 192), coeff3);
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index d018699..6f091ee 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk
@@ -146,11 +146,16 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_4_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h +ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +endif + VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h ifeq ($(ARCH_X86_64), yes)