| /* |
| * Copyright (c) 2025, Alliance for Open Media. All rights reserved. |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <riscv_vector.h> |
| |
| #include "config/aom_config.h" |
| #include "config/av1_rtcd.h" |
| #include "av1/common/cdef_block.h" |
| |
| // partial A is a 16-bit vector of the form: |
| // [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: |
| // [0 y1 y2 y3 y4 y5 y6 y7]. |
| // This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... |
| // (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 |
| // and const2. |
| static inline vuint32m1_t fold_mul_and_sum_rvv(vint16m1_t partiala, |
| vint16m1_t partialb, |
| vuint32m1_t const1, |
| vuint32m1_t const2) { |
| // Square and add the corresponding x and y values. |
| vint32m2_t cost = __riscv_vwmul_vv_i32m2(partiala, partiala, 8); |
| cost = __riscv_vwmacc_vv_i32m2(cost, partialb, partialb, 8); |
| |
| // Multiply by constant. |
| vuint32m2_t tmp1_u32m2 = __riscv_vreinterpret_v_i32m2_u32m2(cost); |
| vuint32m1_t cost_u32m1 = __riscv_vmul_vv_u32m1( |
| __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const1, 4); |
| tmp1_u32m2 = __riscv_vslidedown_vx_u32m2(tmp1_u32m2, 4, 8); |
| vuint32m1_t ret = __riscv_vmacc_vv_u32m1( |
| cost_u32m1, __riscv_vlmul_trunc_v_u32m2_u32m1(tmp1_u32m2), const2, 4); |
| return ret; |
| } |
| |
| // This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal |
| // down-right, 6 is vertical). |
| // |
| // For each direction the lines are shifted so that we can perform a |
| // basic sum on each vector element. For example, direction 5 is "south by |
| // southeast", so we need to add the pixels along each line i below: |
| // |
| // 0  1 2 3 4 5 6 7 |
| // 0  1 2 3 4 5 6 7 |
| // 8  0 1 2 3 4 5 6 |
| // 8  0 1 2 3 4 5 6 |
| // 9  8 0 1 2 3 4 5 |
| // 9  8 0 1 2 3 4 5 |
| // 10 9 8 0 1 2 3 4 |
| // 10 9 8 0 1 2 3 4 |
| // |
| // For this to fit nicely in vectors, the lines need to be shifted like so: |
| //        0 1 2 3 4 5 6 7 |
| //        0 1 2 3 4 5 6 7 |
| //      8 0 1 2 3 4 5 6 |
| //      8 0 1 2 3 4 5 6 |
| //    9 8 0 1 2 3 4 5 |
| //    9 8 0 1 2 3 4 5 |
| //  10 9 8 0 1 2 3 4 |
| //  10 9 8 0 1 2 3 4 |
| // |
| // In this configuration we can now perform SIMD additions to get the cost |
| // along direction 5. Since this won't fit into a single 128-bit vector, we use |
| // two of them to compute each half of the new configuration, and pad the empty |
| // spaces with zeros. Similar shifting is done for other directions, except |
| // direction 6 which is straightforward as it's the vertical direction. |
| static vuint32m1_t compute_vert_directions_rvv( |
| vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, |
| vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, |
| vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { |
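|   // The slide-downs below read lanes past the 8-pixel row, so the adds that |
|   // feed them use a wider vl (VL_SLIDE_DOWN, up to 16 lanes) to keep those |
|   // lanes zeroed; the rows themselves are zero-padded by the loads in |
|   // cdef_find_dir_rvv. |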
| size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); |
| vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); |
| |
| // Partial sums for lines 0 and 1. |
| vint16m1_t partial4a = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 1), vl); |
| vint16m1_t tmp1_i16m1 = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 2), vl); |
| partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); |
| vint16m1_t partial4b = __riscv_vslide1down_vx_i16m1(lines_0, 0, vl); |
| tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_1, 2, VL_SLIDE_DOWN); |
| partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); |
| tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_0, lines_1, VL_SLIDE_DOWN); |
| vint16m1_t partial5a = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl); |
| vint16m1_t partial5b = |
| __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN); |
| vint16m1_t partial7a = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl); |
| vint16m1_t partial7b = |
| __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN); |
| vint16m1_t partial6 = __riscv_vmv_v_v_i16m1(tmp1_i16m1, vl); |
| |
| // Partial sums for lines 2 and 3. |
| tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 3), vl); |
| partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); |
| tmp1_i16m1 = __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 4), vl); |
| partial4a = __riscv_vadd_vv_i16m1(partial4a, tmp1_i16m1, vl); |
| tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_2, 3, VL_SLIDE_DOWN); |
| partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); |
| tmp1_i16m1 = __riscv_vslidedown_vx_i16m1(lines_3, 4, VL_SLIDE_DOWN); |
| partial4b = __riscv_vadd_vv_i16m1(partial4b, tmp1_i16m1, vl); |
| tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_2, lines_3, VL_SLIDE_DOWN); |
| partial5a = __riscv_vadd_vv_i16m1( |
| partial5a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); |
| partial5b = __riscv_vadd_vv_i16m1( |
| partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); |
| partial7a = __riscv_vadd_vv_i16m1( |
| partial7a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); |
| partial7b = __riscv_vadd_vv_i16m1( |
| partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); |
| partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); |
| |
| // Partial sums for lines 4 and 5. |
| partial4a = __riscv_vadd_vv_i16m1( |
| partial4a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 5), vl), vl); |
| partial4a = __riscv_vadd_vv_i16m1( |
| partial4a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); |
| partial4b = __riscv_vadd_vv_i16m1( |
| partial4b, __riscv_vslidedown_vx_i16m1(lines_4, 5, VL_SLIDE_DOWN), vl); |
| partial4b = __riscv_vadd_vv_i16m1( |
| partial4b, __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN), vl); |
| tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_4, lines_5, VL_SLIDE_DOWN); |
| partial5a = __riscv_vadd_vv_i16m1( |
| partial5a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 5), vl), vl); |
| partial5b = __riscv_vadd_vv_i16m1( |
| partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 5, VL_SLIDE_DOWN), vl); |
| partial7a = __riscv_vadd_vv_i16m1( |
| partial7a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 4), vl), vl); |
| partial7b = __riscv_vadd_vv_i16m1( |
| partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 4, VL_SLIDE_DOWN), vl); |
| partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); |
| |
| // Partial sums for lines 6 and 7. |
| partial4a = __riscv_vadd_vv_i16m1( |
| partial4a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 7), vl), vl); |
| partial4a = __riscv_vadd_vv_i16m1(partial4a, lines_7, vl); |
| partial4b = __riscv_vadd_vv_i16m1( |
| partial4b, __riscv_vslidedown_vx_i16m1(lines_6, 7, VL_SLIDE_DOWN), vl); |
| tmp1_i16m1 = __riscv_vadd_vv_i16m1(lines_6, lines_7, VL_SLIDE_DOWN); |
| partial5a = __riscv_vadd_vv_i16m1( |
| partial5a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 6), vl), vl); |
| partial5b = __riscv_vadd_vv_i16m1( |
| partial5b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 6, VL_SLIDE_DOWN), vl); |
| partial7a = __riscv_vadd_vv_i16m1( |
| partial7a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, tmp1_i16m1, (8 - 3), vl), vl); |
| partial7b = __riscv_vadd_vv_i16m1( |
| partial7b, __riscv_vslidedown_vx_i16m1(tmp1_i16m1, 3, VL_SLIDE_DOWN), vl); |
| partial6 = __riscv_vadd_vv_i16m1(partial6, tmp1_i16m1, vl); |
| |
| // const0 = { 840, 420, 280, 210, } |
| vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); |
| |
| // const1 = { 168, 140, 120, 105, } |
| vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); |
| |
| // const2 = { 0, 0, 420, 210, } |
| vuint32m1_t const2 = __riscv_vmv_v_x_u32m1(0, 4); |
| const2 = __riscv_vslide1down_vx_u32m1(const2, 420, 4); |
| const2 = __riscv_vslide1down_vx_u32m1(const2, 210, 4); |
| |
| // const3 = { 140, 105, 105, 105, }; |
| vuint32m1_t const3 = __riscv_vmv_v_x_u32m1(105, 4); |
| const3 = __riscv_vslide1up_vx_u32m1(const3, 140, 4); |
| |
| // Compute costs in terms of partial sums. |
| vint32m2_t tmp1_i32m2 = __riscv_vwmul_vv_i32m2(partial6, partial6, vl); |
| vint32m2_t partial6_s32 = __riscv_vslidedown_vx_i32m2(tmp1_i32m2, 4, vl); |
| partial6_s32 = __riscv_vadd_vv_i32m2(partial6_s32, tmp1_i32m2, 4); |
| |
| // Reverse partial B. |
| // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, }. |
| vuint32m1_t costs_0, costs_1, costs_2, costs_3; |
| static const uint16_t tab_u16[8] = { |
| 6, 5, 4, 3, 2, 1, 0, 7, |
| }; |
| vuint16m1_t index_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); |
| vint16m1_t partial4b_rv = |
| __riscv_vrgather_vv_i16m1(partial4b, index_u16m1, 8); |
| costs_0 = fold_mul_and_sum_rvv(partial4a, partial4b_rv, const0, const1); |
| vuint32m1_t partial6_u32 = __riscv_vreinterpret_v_i32m1_u32m1( |
| __riscv_vlmul_trunc_v_i32m2_i32m1(partial6_s32)); |
| costs_2 = __riscv_vmul_vx_u32m1(partial6_u32, 105, 4); |
| vint16m1_t partial5b_rv = |
| __riscv_vrgather_vv_i16m1(partial5b, index_u16m1, 8); |
| costs_1 = fold_mul_and_sum_rvv(partial5a, partial5b_rv, const2, const3); |
| vint16m1_t partial7b_rv = |
| __riscv_vrgather_vv_i16m1(partial7b, index_u16m1, 8); |
| costs_3 = fold_mul_and_sum_rvv(partial7a, partial7b_rv, const2, const3); |
| |
|   // Combine values: reduce each cost vector to a scalar and pack the sums. |
| vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); |
| vuint32m1_t cost0_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); |
| vuint32m1_t cost1_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); |
| vuint32m1_t cost2_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); |
| vuint32m1_t cost3_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); |
| |
| vuint32m1_t cost47 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); |
| cost47 = __riscv_vslideup_vx_u32m1(cost47, cost2_sum, 2, 4); |
| cost47 = __riscv_vslideup_vx_u32m1(cost47, cost3_sum, 3, 4); |
| __riscv_vse32_v_u32m1(&cost[0], cost47, 4); |
| return cost47; |
| } |
| |
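| // Pairwise-adds neighbouring 16-bit elements of partiala, partialb and |
| // partialc, squares the four pair sums from each, and combines them as |
| // partialb_pairs^2 * 105 + (partiala_pairs^2 + partialc_pairs^2) * const0. |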
| static inline vuint32m1_t fold_mul_and_sum_pairwise_rvv(vint16m1_t partiala, |
| vint16m1_t partialb, |
| vint16m1_t partialc, |
| vuint32m1_t const0) { |
| vuint16m1_t vid_u16m1 = __riscv_vid_v_u16m1(4); |
| vuint16m1_t index_u16m1 = __riscv_vsll_vx_u16m1(vid_u16m1, 1, 4); |
| vint16m1_t tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partiala, 0, 8); |
| vint32m2_t partiala_i32m2 = __riscv_vwadd_vv_i32m2(partiala, tmp_i16m1, 8); |
| tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialb, 0, 8); |
| vint32m2_t partialb_i32m2 = __riscv_vwadd_vv_i32m2(partialb, tmp_i16m1, 8); |
| |
| tmp_i16m1 = __riscv_vslide1down_vx_i16m1(partialc, 0, 8); |
| vint32m2_t partialc_i32m2 = __riscv_vwadd_vv_i32m2(partialc, tmp_i16m1, 8); |
| partiala_i32m2 = __riscv_vmul_vv_i32m2(partiala_i32m2, partiala_i32m2, 8); |
| partialb_i32m2 = __riscv_vmul_vv_i32m2(partialb_i32m2, partialb_i32m2, 8); |
| vint32m1_t partialb_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( |
| __riscv_vrgatherei16_vv_i32m2(partialb_i32m2, index_u16m1, 4)); |
| partialc_i32m2 = __riscv_vmul_vv_i32m2(partialc_i32m2, partialc_i32m2, 8); |
| partiala_i32m2 = __riscv_vadd_vv_i32m2(partiala_i32m2, partialc_i32m2, 8); |
| vint32m1_t partiala_i32m1 = __riscv_vlmul_trunc_v_i32m2_i32m1( |
| __riscv_vrgatherei16_vv_i32m2(partiala_i32m2, index_u16m1, 4)); |
| |
| vuint32m1_t cost = __riscv_vmul_vx_u32m1( |
| __riscv_vreinterpret_v_i32m1_u32m1(partialb_i32m1), 105, 4); |
| cost = __riscv_vmacc_vv_u32m1( |
| cost, __riscv_vreinterpret_v_i32m1_u32m1(partiala_i32m1), const0, 4); |
| return cost; |
| } |
| |
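| // Reduces each of the four 16-bit input vectors to a single 32-bit sum and |
| // packs the four sums into lanes 0..3 of the result. |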
| static inline vint32m1_t horizontal_add_4d_s16x8(vint16m1_t lines_0, |
| vint16m1_t lines_1, |
| vint16m1_t lines_2, |
| vint16m1_t lines_3) { |
| vint32m1_t vec_scalar_i32m1 = __riscv_vmv_s_x_i32m1(0, 1); |
| vint32m1_t lines0_sum = |
| __riscv_vwredsum_vs_i16m1_i32m1(lines_0, vec_scalar_i32m1, 8); |
| vint32m1_t lines1_sum = |
| __riscv_vwredsum_vs_i16m1_i32m1(lines_1, vec_scalar_i32m1, 8); |
| vint32m1_t lines2_sum = |
| __riscv_vwredsum_vs_i16m1_i32m1(lines_2, vec_scalar_i32m1, 8); |
| vint32m1_t lines3_sum = |
| __riscv_vwredsum_vs_i16m1_i32m1(lines_3, vec_scalar_i32m1, 8); |
| |
| vint32m1_t ret = __riscv_vslideup_vx_i32m1(lines0_sum, lines1_sum, 1, 4); |
| ret = __riscv_vslideup_vx_i32m1(ret, lines2_sum, 2, 4); |
| ret = __riscv_vslideup_vx_i32m1(ret, lines3_sum, 3, 4); |
| return ret; |
| } |
| |
| // This function computes the cost along directions 0, 1, 2, 3. (0 means |
| // 45-degree up-right, 2 is horizontal). |
| // |
| // For directions 1 and 3 ("east northeast" and "east southeast") the shifted |
| // lines need three vectors instead of two. For direction 1 for example, we need |
| // to compute the sums along the line i below: |
| // 0 0 1 1 2 2 3 3 |
| // 1 1 2 2 3 3 4 4 |
| // 2 2 3 3 4 4 5 5 |
| // 3 3 4 4 5 5 6 6 |
| // 4 4 5 5 6 6 7 7 |
| // 5 5 6 6 7 7 8 8 |
| // 6 6 7 7 8 8 9 9 |
| // 7 7 8 8 9 9 10 10 |
| // |
| // Which means we need the following configuration: |
| // 0 0 1 1 2 2 3 3 |
| //     1 1 2 2 3 3 4 4 |
| //         2 2 3 3 4 4 5 5 |
| //             3 3 4 4 5 5 6 6 |
| //                 4 4 5 5 6 6 7 7 |
| //                     5 5 6 6 7 7 8 8 |
| //                         6 6 7 7 8 8 9 9 |
| //                             7 7 8 8 9 9 10 10 |
| // |
| // Three vectors are needed to compute this, as well as some extra pairwise |
| // additions. |
| static vuint32m1_t compute_horiz_directions_rvv( |
| vint16m1_t lines_0, vint16m1_t lines_1, vint16m1_t lines_2, |
| vint16m1_t lines_3, vint16m1_t lines_4, vint16m1_t lines_5, |
| vint16m1_t lines_6, vint16m1_t lines_7, uint32_t cost[4], size_t vl) { |
| // Compute diagonal directions (1, 2, 3). |
| // Partial sums for lines 0 and 1. |
| size_t VL_SLIDE_DOWN = __riscv_vsetvl_e16m1(16); |
| vint16m1_t vec_zero_i16m1 = __riscv_vmv_v_x_i16m1(0, vl); |
| vint16m1_t partial0a = __riscv_vmv_v_v_i16m1(lines_0, vl); |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 7), vl), vl); |
| vint16m1_t partial0b = __riscv_vslidedown_vx_i16m1(lines_1, 7, VL_SLIDE_DOWN); |
| vint16m1_t partial1a = __riscv_vadd_vv_i16m1( |
| lines_0, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, (8 - 6), vl), |
| vl); |
| vint16m1_t partial1b = __riscv_vslidedown_vx_i16m1(lines_1, 6, VL_SLIDE_DOWN); |
| vint16m1_t partial3a = __riscv_vslidedown_vx_i16m1(lines_0, 2, VL_SLIDE_DOWN); |
| partial3a = __riscv_vadd_vv_i16m1( |
| partial3a, __riscv_vslidedown_vx_i16m1(lines_1, 4, VL_SLIDE_DOWN), vl); |
| vint16m1_t partial3b = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_0, (8 - 2), vl); |
| partial3b = __riscv_vadd_vv_i16m1( |
| partial3b, __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_1, 4, vl), vl); |
| |
| // Partial sums for lines 2 and 3. |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 5), vl), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslidedown_vx_i16m1(lines_3, 5, VL_SLIDE_DOWN), vl); |
| partial1a = __riscv_vadd_vv_i16m1( |
| partial1a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 4), vl), vl); |
| partial1a = __riscv_vadd_vv_i16m1( |
| partial1a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_3, (8 - 2), vl), vl); |
| partial1b = __riscv_vadd_vv_i16m1( |
| partial1b, __riscv_vslidedown_vx_i16m1(lines_2, 4, VL_SLIDE_DOWN), vl); |
| partial1b = __riscv_vadd_vv_i16m1( |
| partial1b, __riscv_vslidedown_vx_i16m1(lines_3, 2, VL_SLIDE_DOWN), vl); |
| partial3a = __riscv_vadd_vv_i16m1( |
| partial3a, __riscv_vslidedown_vx_i16m1(lines_2, 6, VL_SLIDE_DOWN), vl); |
| partial3b = __riscv_vadd_vv_i16m1( |
| partial3b, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_2, (8 - 6), vl), vl); |
| partial3b = __riscv_vadd_vv_i16m1(partial3b, lines_3, vl); |
| |
| // Partial sums for lines 4 and 5. |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 4), vl), vl); |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 3), vl), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslidedown_vx_i16m1(lines_4, 4, VL_SLIDE_DOWN), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslidedown_vx_i16m1(lines_5, 3, VL_SLIDE_DOWN), vl); |
| partial1b = __riscv_vadd_vv_i16m1(partial1b, lines_4, vl); |
| partial1b = __riscv_vadd_vv_i16m1( |
| partial1b, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 6), vl), vl); |
| vint16m1_t partial1c = __riscv_vslidedown_vx_i16m1(lines_5, 6, VL_SLIDE_DOWN); |
| partial3b = __riscv_vadd_vv_i16m1( |
| partial3b, __riscv_vslidedown_vx_i16m1(lines_4, 2, VL_SLIDE_DOWN), vl); |
| partial3b = __riscv_vadd_vv_i16m1( |
| partial3b, __riscv_vslidedown_vx_i16m1(lines_5, 4, VL_SLIDE_DOWN), vl); |
| vint16m1_t partial3c = |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_4, (8 - 2), vl); |
| partial3c = __riscv_vadd_vv_i16m1( |
| partial3c, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_5, (8 - 4), vl), vl); |
| |
| // Partial sums for lines 6 and 7. |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 2), vl), vl); |
| partial0a = __riscv_vadd_vv_i16m1( |
| partial0a, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 1), vl), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslidedown_vx_i16m1(lines_6, 2, VL_SLIDE_DOWN), vl); |
| partial0b = __riscv_vadd_vv_i16m1( |
| partial0b, __riscv_vslide1down_vx_i16m1(lines_7, 0, vl), vl); |
| partial1b = __riscv_vadd_vv_i16m1( |
| partial1b, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 4), vl), vl); |
| partial1b = __riscv_vadd_vv_i16m1( |
| partial1b, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_7, (8 - 2), vl), vl); |
| partial1c = __riscv_vadd_vv_i16m1( |
| partial1c, __riscv_vslidedown_vx_i16m1(lines_6, 4, VL_SLIDE_DOWN), vl); |
| partial1c = __riscv_vadd_vv_i16m1( |
| partial1c, __riscv_vslidedown_vx_i16m1(lines_7, 2, VL_SLIDE_DOWN), vl); |
| partial3b = __riscv_vadd_vv_i16m1( |
| partial3b, __riscv_vslidedown_vx_i16m1(lines_6, 6, VL_SLIDE_DOWN), vl); |
| partial3c = __riscv_vadd_vv_i16m1( |
| partial3c, |
| __riscv_vslideup_vx_i16m1(vec_zero_i16m1, lines_6, (8 - 6), vl), vl); |
| partial3c = __riscv_vadd_vv_i16m1(partial3c, lines_7, vl); |
| |
| // Special case for direction 2 as it's just a sum along each line. |
| vint32m1_t partial2a = |
| horizontal_add_4d_s16x8(lines_0, lines_1, lines_2, lines_3); |
| vint32m1_t partial2b = |
| horizontal_add_4d_s16x8(lines_4, lines_5, lines_6, lines_7); |
| vuint32m1_t partial2a_u32 = __riscv_vreinterpret_v_i32m1_u32m1( |
| __riscv_vmul_vv_i32m1(partial2a, partial2a, 4)); |
| vuint32m1_t partial2b_u32 = __riscv_vreinterpret_v_i32m1_u32m1( |
| __riscv_vmul_vv_i32m1(partial2b, partial2b, 4)); |
| |
| // const0 = { 840, 420, 280, 210, } |
| vuint32m1_t const0 = __riscv_vmv_s_x_u32m1(210, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 280, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 420, 4); |
| const0 = __riscv_vslide1up_vx_u32m1(const0, 840, 4); |
| |
| // const1 = { 168, 140, 120, 105, } |
| vuint32m1_t const1 = __riscv_vmv_s_x_u32m1(105, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 120, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 140, 4); |
| const1 = __riscv_vslide1up_vx_u32m1(const1, 168, 4); |
| |
| // const2 = { 420, 210, 140, 105, }; |
| vuint32m1_t const2 = __riscv_vmv_s_x_u32m1(105, 4); |
| const2 = __riscv_vslide1up_vx_u32m1(const2, 140, 4); |
| const2 = __riscv_vslide1up_vx_u32m1(const2, 210, 4); |
| const2 = __riscv_vslide1up_vx_u32m1(const2, 420, 4); |
| |
| static const uint16_t tab_u16[8] = { |
| 0, 6, 5, 4, 3, 2, 1, 0, |
| }; |
| vuint32m1_t costs_0, costs_1, costs_2, costs_3; |
| vuint16m1_t template_u16m1 = __riscv_vle16_v_u16m1(tab_u16, 8); |
| |
|   // Reverse partial b. |
| // pattern = { 6, 5, 4, 3, 2, 1, 0, 7, } |
| vuint16m1_t index_u16m1 = __riscv_vslide1down_vx_u16m1(template_u16m1, 7, 8); |
| vint16m1_t partial0b_rv = |
| __riscv_vrgather_vv_i16m1(partial0b, index_u16m1, 8); |
| costs_0 = fold_mul_and_sum_rvv(partial0a, partial0b_rv, const0, const1); |
| |
| // Reverse partial c. |
| // pattern = { 5, 4, 3, 2, 1, 0, 6, 7, } |
| vuint16m1_t index_pair_u16m1 = |
| __riscv_vslide1down_vx_u16m1(template_u16m1, 6, 8); |
| index_pair_u16m1 = __riscv_vslide1down_vx_u16m1(index_pair_u16m1, 7, 8); |
| vint16m1_t partialc_rv = |
| __riscv_vrgather_vv_i16m1(partial1c, index_pair_u16m1, 8); |
| costs_1 = |
| fold_mul_and_sum_pairwise_rvv(partial1a, partial1b, partialc_rv, const2); |
| |
| costs_2 = __riscv_vadd_vv_u32m1(partial2a_u32, partial2b_u32, 4); |
| costs_2 = __riscv_vmul_vx_u32m1(costs_2, 105, 4); |
| |
| vint16m1_t partial3a_rv = |
| __riscv_vrgather_vv_i16m1(partial3a, index_pair_u16m1, 8); |
| costs_3 = |
| fold_mul_and_sum_pairwise_rvv(partial3c, partial3b, partial3a_rv, const2); |
| |
|   // Combine values: reduce each cost vector to a scalar and pack the sums. |
| vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); |
| vuint32m1_t cost0_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_0, vec_scalar_u32m1, 4); |
| vuint32m1_t cost1_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_1, vec_scalar_u32m1, 4); |
| vuint32m1_t cost2_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_2, vec_scalar_u32m1, 4); |
| vuint32m1_t cost3_sum = |
| __riscv_vredsum_vs_u32m1_u32m1(costs_3, vec_scalar_u32m1, 4); |
| |
| costs_0 = __riscv_vslideup_vx_u32m1(cost0_sum, cost1_sum, 1, 4); |
| costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost2_sum, 2, 4); |
| costs_0 = __riscv_vslideup_vx_u32m1(costs_0, cost3_sum, 3, 4); |
| __riscv_vse32_v_u32m1(&cost[0], costs_0, 4); |
| return costs_0; |
| } |
| |
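| // Finds the direction 0..7 with the highest cost for the 8x8 block at img: |
| // each row is downshifted by coeff_shift and centred by subtracting 128, then |
| // the costs of the "mostly horizontal" (0..3) and "mostly vertical" (4..7) |
| // directions are computed and the largest one is selected. The scaled |
| // difference to the orthogonal direction's cost is written to *var. |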
| int cdef_find_dir_rvv(const uint16_t *img, int stride, int32_t *var, |
| int coeff_shift) { |
| size_t vl = 8; |
| size_t vlmax = __riscv_vsetvlmax_e16m1(); |
| vuint16m1_t s; |
| vint16m1_t lines_0, lines_1, lines_2, lines_3; |
| vint16m1_t lines_4, lines_5, lines_6, lines_7; |
| vuint16m1_t vec_zero_u16m1 = |
| __riscv_vmv_v_x_u16m1(0, __riscv_vsetvl_e16m1(16)); |
| |
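|   // Load the 8 rows of the block. When VLMAX exceeds the 8-pixel row, use |
|   // tail-undisturbed loads against a zeroed register so that the lanes past |
|   // the row start out as zeros for the direction kernels below. |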
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_0 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_0 = __riscv_vsub_vx_i16m1(lines_0, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_1 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_1 = __riscv_vsub_vx_i16m1(lines_1, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_2 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_2 = __riscv_vsub_vx_i16m1(lines_2, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_3 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_3 = __riscv_vsub_vx_i16m1(lines_3, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_4 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_4 = __riscv_vsub_vx_i16m1(lines_4, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_5 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_5 = __riscv_vsub_vx_i16m1(lines_5, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_6 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_6 = __riscv_vsub_vx_i16m1(lines_6, 128, vl); |
| |
| img += stride; |
| if (vlmax == 8) |
| s = __riscv_vle16_v_u16m1(img, vl); |
| else |
| s = __riscv_vle16_v_u16m1_tu(vec_zero_u16m1, img, vl); |
| lines_7 = __riscv_vreinterpret_v_u16m1_i16m1( |
| __riscv_vsrl_vx_u16m1(s, coeff_shift, vl)); |
| lines_7 = __riscv_vsub_vx_i16m1(lines_7, 128, vl); |
| |
| // Compute "mostly vertical" directions. |
| uint32_t cost[8]; |
| vuint32m1_t cost47 = |
| compute_vert_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, |
| lines_5, lines_6, lines_7, cost + 4, vl); |
| |
| // Compute "mostly horizontal" directions. |
| vuint32m1_t cost03 = |
| compute_horiz_directions_rvv(lines_0, lines_1, lines_2, lines_3, lines_4, |
| lines_5, lines_6, lines_7, cost, vl); |
| |
|   // Find the max cost as well as its index to get best_dir. |
|   // The max cost is reduced to a scalar and then compared against every lane |
|   // of the original cost vectors cost03 and cost47 to find its position. |
| vuint32m1_t vec_scalar_u32m1 = __riscv_vmv_s_x_u32m1(0, 1); |
| vuint32m1_t cost07 = __riscv_vmaxu_vv_u32m1(cost03, cost47, 4); |
| uint32_t best_cost = __riscv_vmv_x_s_u32m1_u32( |
| __riscv_vredmaxu_vs_u32m1_u32m1(cost07, vec_scalar_u32m1, 4)); |
| vbool32_t mask_cost = __riscv_vmseq_vx_u32m1_b32(cost03, best_cost, 4); |
| long best_dir = __riscv_vfirst_m_b32(mask_cost, 4); |
| if (best_dir == -1) { |
| mask_cost = __riscv_vmseq_vx_u32m1_b32(cost47, best_cost, 4); |
| best_dir = __riscv_vfirst_m_b32(mask_cost, 4); |
| best_dir += 4; |
| } |
| |
| // Difference between the optimal variance and the variance along the |
| // orthogonal direction. Again, the sum(x^2) terms cancel out. |
| *var = best_cost - cost[(best_dir + 4) & 7]; |
| |
| // We'd normally divide by 840, but dividing by 1024 is close enough |
| // for what we're going to do with this. |
| *var >>= 10; |
| return (int)best_dir; |
| } |
| |
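| // Copies an 8-bit block into a 16-bit buffer, widening each pixel. The column |
| // loop is strip-mined with vsetvl so any width is handled. |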
| void cdef_copy_rect8_8bit_to_16bit_rvv(uint16_t *dst, int dstride, |
| const uint8_t *src, int sstride, |
| int width, int height) { |
| do { |
| int w = 0; |
| size_t num_cols = width; |
| while (num_cols > 0) { |
| size_t vl = __riscv_vsetvl_e8mf2(num_cols); |
| vuint8mf2_t u8_src = __riscv_vle8_v_u8mf2(src + w, vl); |
| vuint16m1_t u16_src = __riscv_vwcvtu_x_x_v_u16m1(u8_src, vl); |
| __riscv_vse16_v_u16m1(dst + w, u16_src, vl); |
| |
| w += vl; |
| num_cols -= vl; |
| } |
| src += sstride; |
| dst += dstride; |
| } while (--height != 0); |
| } |
| |
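| // Same as above, but the source is already 16-bit, so rows are copied as-is. |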
| void cdef_copy_rect8_16bit_to_16bit_rvv(uint16_t *dst, int dstride, |
| const uint16_t *src, int sstride, |
| int width, int height) { |
| do { |
| int w = 0; |
| size_t num_cols = width; |
| while (num_cols > 0) { |
| size_t vl = __riscv_vsetvl_e16m1(num_cols); |
| vuint16m1_t u16_src = __riscv_vle16_v_u16m1(src + w, vl); |
| __riscv_vse16_v_u16m1(dst + w, u16_src, vl); |
| |
| w += vl; |
| num_cols -= vl; |
| } |
| src += sstride; |
| dst += dstride; |
| } while (--height != 0); |
| } |
| |
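| // CDEF constraint function: returns sign(a - b) * min(|a - b|, |
| // max(0, threshold - (|a - b| >> adjdamp))), or zero when threshold is zero. |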
| static inline vint16m1_t constrain16(vint16m1_t a, vint16m1_t b, |
| int16_t threshold, int16_t adjdamp, |
| size_t vl) { |
| if (!threshold) return __riscv_vmv_v_x_i16m1(0, vl); |
| const vbool16_t mask = __riscv_vmslt_vv_i16m1_b16(a, b, vl); |
| const vint16m1_t diff = __riscv_vsub_vv_i16m1(a, b, vl); |
| const vint16m1_t abs_diff = __riscv_vneg_v_i16m1_tumu(mask, diff, diff, vl); |
| const vint16m1_t shift = __riscv_vsra_vx_i16m1(abs_diff, adjdamp, vl); |
| const vint16m1_t thr = __riscv_vmv_v_x_i16m1(threshold, vl); |
| const vint16m1_t sub = __riscv_vsub_vv_i16m1(thr, shift, vl); |
| const vint16m1_t max = __riscv_vmax_vx_i16m1(sub, 0, vl); |
| const vint16m1_t min = __riscv_vmin_vv_i16m1(abs_diff, max, vl); |
| return __riscv_vneg_v_i16m1_tumu(mask, min, min, vl); |
| } |
| |
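| // Element-wise maximum of a and b, where lanes of a equal to CDEF_VERY_LARGE |
| // (border padding) are first replaced by b so padding never wins the max. |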
| static inline vint16m1_t vmax_mask(vint16m1_t a, vint16m1_t b, size_t vl) { |
| const vbool16_t mask = |
| __riscv_vmseq_vx_i16m1_b16(a, (int16_t)CDEF_VERY_LARGE, vl); |
| const vint16m1_t val = __riscv_vmerge_vvm_i16m1(a, b, mask, vl); |
| return __riscv_vmax_vv_i16m1(val, b, vl); |
| } |
| |
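| // Loads 4 pixels from each of two rows `stride` elements apart and packs them |
| // into a single vector: row 0 in lanes 0..3, row 1 in lanes 4..7. |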
| static inline vint16m1_t load_strided_i16_4x2(int16_t *addr, |
| const ptrdiff_t stride, |
| size_t vl) { |
| const vint16m1_t px_l1 = __riscv_vle16_v_i16m1(addr + stride, vl); |
| const vint16m1_t px_l0 = __riscv_vle16_v_i16m1(addr, vl); |
| return __riscv_vslideup_vx_i16m1(px_l0, px_l1, 4, vl); |
| } |
| |
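| // Stores lanes 0..3 of vdst to addr and lanes 4..7 to addr + stride. |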
| static inline void store_strided_u8_4x2(uint8_t *addr, vuint8mf2_t vdst, |
| const ptrdiff_t stride, size_t vl) { |
| __riscv_vse8_v_u8mf2(addr, vdst, vl >> 1); |
| vdst = __riscv_vslidedown_vx_u8mf2(vdst, 4, vl); |
| __riscv_vse8_v_u8mf2(addr + stride, vdst, vl >> 1); |
| } |
| |
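| // As above, but for a 16-bit destination. |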
| static inline void store_strided_u16_4x2(uint16_t *addr, vuint16m1_t vdst, |
| const ptrdiff_t stride, size_t vl) { |
| __riscv_vse16_v_u16m1(addr, vdst, vl >> 1); |
| vdst = __riscv_vslidedown_vx_u16m1(vdst, 4, vl); |
| __riscv_vse16_v_u16m1(addr + stride, vdst, vl >> 1); |
| } |
| |
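| // The macros below assemble the per-row filter kernels. LOAD_PIX/LOAD_PIX4 |
| // fetch the centre pixels (one 8-pixel row, or two 4-pixel rows packed into |
| // one vector) and clear the accumulator; LOAD_DIR/LOAD_DIR4 fetch the four |
| // taps along a direction; CONSTRAIN applies constrain16 to them; MIN_MAX |
| // tracks the clamping range; the *_UPDATE_SUM macros accumulate the weighted |
| // taps; BIAS applies the rounding offset; and the STORE* macros narrow the |
| // result where needed, write it out and advance the input/output pointers. |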
| #define LOAD_PIX(addr) \ |
| const vint16m1_t px = __riscv_vle16_v_i16m1((int16_t *)addr, vl); \ |
| vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) |
| |
| #define LOAD_PIX4(addr) \ |
| const vint16m1_t px = \ |
| load_strided_i16_4x2((int16_t *)addr, CDEF_BSTRIDE, vl); \ |
| vint16m1_t sum = __riscv_vmv_v_x_i16m1(0, vl) |
| |
| #define LOAD_DIR(p, addr, o0, o1) \ |
| const vint16m1_t p##0 = __riscv_vle16_v_i16m1((int16_t *)addr + o0, vl); \ |
| const vint16m1_t p##1 = __riscv_vle16_v_i16m1((int16_t *)addr - o0, vl); \ |
| const vint16m1_t p##2 = __riscv_vle16_v_i16m1((int16_t *)addr + o1, vl); \ |
| const vint16m1_t p##3 = __riscv_vle16_v_i16m1((int16_t *)addr - o1, vl) |
| |
| #define LOAD_DIR4(p, addr, o0, o1) \ |
| const vint16m1_t p##0 = \ |
| load_strided_i16_4x2((int16_t *)addr + o0, CDEF_BSTRIDE, vl); \ |
| const vint16m1_t p##1 = \ |
| load_strided_i16_4x2((int16_t *)addr - o0, CDEF_BSTRIDE, vl); \ |
| const vint16m1_t p##2 = \ |
| load_strided_i16_4x2((int16_t *)addr + o1, CDEF_BSTRIDE, vl); \ |
| const vint16m1_t p##3 = \ |
| load_strided_i16_4x2((int16_t *)addr - o1, CDEF_BSTRIDE, vl) |
| |
| #define MAKE_TAPS \ |
| const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; \ |
| const int16_t tap0 = (int16_t)(pri_taps[0]); \ |
| const int16_t tap1 = (int16_t)(pri_taps[1]) |
| |
| #define CONSTRAIN(p, strength, shift) \ |
| vint16m1_t p##_c0 = \ |
| constrain16(p##0, px, (int16_t)strength, (int16_t)shift, vl); \ |
| vint16m1_t p##_c1 = \ |
| constrain16(p##1, px, (int16_t)strength, (int16_t)shift, vl); \ |
| vint16m1_t p##_c2 = \ |
| constrain16(p##2, px, (int16_t)strength, (int16_t)shift, vl); \ |
| vint16m1_t p##_c3 = \ |
| constrain16(p##3, px, (int16_t)strength, (int16_t)shift, vl) |
| |
| #define SETUP_MINMAX \ |
| vint16m1_t max = px; \ |
| vint16m1_t min = px |
| |
| #define MIN_MAX(p) \ |
| do { \ |
| max = vmax_mask(p##0, max, vl); \ |
| min = __riscv_vmin_vv_i16m1(p##0, min, vl); \ |
| max = vmax_mask(p##1, max, vl); \ |
| min = __riscv_vmin_vv_i16m1(p##1, min, vl); \ |
| max = vmax_mask(p##2, max, vl); \ |
| min = __riscv_vmin_vv_i16m1(p##2, min, vl); \ |
| max = vmax_mask(p##3, max, vl); \ |
| min = __riscv_vmin_vv_i16m1(p##3, min, vl); \ |
| } while (0) |
| |
| #define PRI_0_UPDATE_SUM(p) \ |
| const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ |
| const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ |
| sum = __riscv_vmacc_vx_i16m1(sum, tap0, p##sum0, vl); \ |
| sum = __riscv_vmacc_vx_i16m1(sum, tap1, p##sum1, vl) |
| |
| #define UPDATE_SUM(p) \ |
| const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ |
| const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ |
| sum = __riscv_vadd_vv_i16m1(sum, p##sum0, vl); \ |
| sum = __riscv_vadd_vv_i16m1(sum, p##sum1, vl) |
| |
| #define SEC_0_UPDATE_SUM(p) \ |
| const vint16m1_t p##sum0 = __riscv_vadd_vv_i16m1(p##_c0, p##_c1, vl); \ |
| const vint16m1_t p##sum1 = __riscv_vadd_vv_i16m1(p##_c2, p##_c3, vl); \ |
| const vint16m1_t p##sum2 = __riscv_vadd_vv_i16m1(p##sum0, p##sum1, vl); \ |
| sum = __riscv_vadd_vv_i16m1(sum, __riscv_vsll_vx_i16m1(p##sum2, 1, vl), vl) |
| |
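| // Rounding step of the filter: unclamped = px + ((8 - (sum < 0) + sum) >> 4). |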
| #define BIAS \ |
| const vbool16_t mask = __riscv_vmslt_vx_i16m1_b16(sum, 0, vl); \ |
| const vint16m1_t v_8 = __riscv_vmv_v_x_i16m1(8, vl); \ |
| const vint16m1_t bias = __riscv_vsub_vx_i16m1_tumu(mask, v_8, v_8, 1, vl); \ |
| const vint16m1_t unclamped = __riscv_vadd_vv_i16m1( \ |
| px, __riscv_vsra_vx_i16m1(__riscv_vadd_vv_i16m1(bias, sum, vl), 4, vl), \ |
| vl) |
| |
| #define STORE4 \ |
| do { \ |
| store_strided_u8_4x2(dst8, vdst, dstride, vl); \ |
| \ |
| in += (CDEF_BSTRIDE << 1); \ |
| dst8 += (dstride << 1); \ |
| } while (0) |
| |
| #define STORE4_CLAMPED \ |
| do { \ |
| BIAS; \ |
| vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ |
| __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ |
| vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ |
| __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ |
| STORE4; \ |
| } while (0) |
| |
| #define STORE4_UNCLAMPED \ |
| do { \ |
| BIAS; \ |
| vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ |
| __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ |
| STORE4; \ |
| } while (0) |
| |
| #define STORE8 \ |
| do { \ |
| __riscv_vse8_v_u8mf2(dst8, vdst, vl); \ |
| \ |
| in += CDEF_BSTRIDE; \ |
| dst8 += dstride; \ |
| } while (0) |
| |
| #define STORE8_CLAMPED \ |
| do { \ |
| BIAS; \ |
| vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ |
| __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ |
| vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ |
| __riscv_vreinterpret_v_i16m1_u16m1(clamped), vl); \ |
| STORE8; \ |
| } while (0) |
| |
| #define STORE8_UNCLAMPED \ |
| do { \ |
| BIAS; \ |
| vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2( \ |
| __riscv_vreinterpret_v_i16m1_u16m1(unclamped), vl); \ |
| STORE8; \ |
| } while (0) |
| |
| #define STORE16_4 \ |
| do { \ |
| store_strided_u16_4x2(dst16, vdst, dstride, vl); \ |
| \ |
| in += (CDEF_BSTRIDE << 1); \ |
| dst16 += (dstride << 1); \ |
| } while (0) |
| |
| #define STORE16_4_CLAMPED \ |
| do { \ |
| BIAS; \ |
| vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ |
| __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ |
| vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ |
| STORE16_4; \ |
| } while (0) |
| |
| #define STORE16_4_UNCLAMPED \ |
| do { \ |
| BIAS; \ |
| vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ |
| STORE16_4; \ |
| } while (0) |
| |
| #define STORE16 \ |
| do { \ |
| __riscv_vse16_v_u16m1(dst16, vdst, vl); \ |
| \ |
| in += CDEF_BSTRIDE; \ |
| dst16 += dstride; \ |
| } while (0) |
| |
| #define STORE16_CLAMPED \ |
| do { \ |
| BIAS; \ |
| vint16m1_t clamped = __riscv_vmin_vv_i16m1( \ |
| __riscv_vmax_vv_i16m1(unclamped, min, vl), max, vl); \ |
| vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(clamped); \ |
| STORE16; \ |
| } while (0) |
| |
| #define STORE16_UNCLAMPED \ |
| do { \ |
| BIAS; \ |
| vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(unclamped); \ |
| STORE16; \ |
| } while (0) |
| |
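| // Full CDEF filter (primary and secondary taps) with 8-bit output. The sum is |
| // clamped to the range spanned by the centre pixel and the taps, with |
| // CDEF_VERY_LARGE padding excluded from the maximum. The 8-wide path handles |
| // one row per iteration; the 4-wide path packs two rows into each vector. |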
| void cdef_filter_8_0_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| const int po1 = cdef_directions[dir][0]; |
| const int po2 = cdef_directions[dir][1]; |
| const int s1o1 = cdef_directions[dir + 2][0]; |
| const int s1o2 = cdef_directions[dir + 2][1]; |
| const int s2o1 = cdef_directions[dir - 2][0]; |
| const int s2o2 = cdef_directions[dir - 2][1]; |
| MAKE_TAPS; |
| |
| if (pri_strength) { |
| pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
| } |
| if (sec_strength) { |
| sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| SETUP_MINMAX; |
| |
| // Primary pass |
| LOAD_DIR(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| MIN_MAX(p); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Secondary pass 1 |
| LOAD_DIR(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| MIN_MAX(s); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| MIN_MAX(s2); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE8_CLAMPED; |
| } while (--h != 0); |
| } else { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| SETUP_MINMAX; |
| |
| // Primary pass |
| LOAD_DIR4(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| MIN_MAX(p); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Secondary pass 1 |
| LOAD_DIR4(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| MIN_MAX(s); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR4(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| MIN_MAX(s2); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE4_CLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
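| // Primary-only variant: the secondary strength and damping are unused and the |
| // result is stored without clamping. |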
| void cdef_filter_8_1_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)sec_strength; |
| (void)sec_damping; |
| |
| const int po1 = cdef_directions[dir][0]; |
| const int po2 = cdef_directions[dir][1]; |
| MAKE_TAPS; |
| |
| if (pri_strength) { |
| pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| |
| // Primary pass |
| LOAD_DIR(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Store |
| STORE8_UNCLAMPED; |
| } while (--h != 0); |
| } else { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| |
| // Primary pass |
| LOAD_DIR4(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Store |
| STORE4_UNCLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
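| // Secondary-only variant: the primary strength and damping are unused and the |
| // result is stored without clamping. |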
| void cdef_filter_8_2_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)pri_strength; |
| (void)pri_damping; |
| (void)coeff_shift; |
| |
| const int s1o1 = cdef_directions[dir + 2][0]; |
| const int s1o2 = cdef_directions[dir + 2][1]; |
| const int s2o1 = cdef_directions[dir - 2][0]; |
| const int s2o2 = cdef_directions[dir - 2][1]; |
| |
| if (sec_strength) { |
| sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| |
| // Secondary pass 1 |
| LOAD_DIR(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE8_UNCLAMPED; |
| } while (--h != 0); |
| } else { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| |
| // Secondary pass 1 |
| LOAD_DIR4(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR4(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE4_UNCLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
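| // Copy-only variant: all filter parameters are unused; the 16-bit |
| // intermediate block is narrowed back to 8 bits and copied to the |
| // destination. |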
| void cdef_filter_8_3_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)pri_strength; |
| (void)sec_strength; |
| (void)dir; |
| (void)pri_damping; |
| (void)sec_damping; |
| (void)coeff_shift; |
| |
| if (block_width == 8) { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); |
| const vuint8mf2_t vdst = __riscv_vncvt_x_x_w_u8mf2(px, vl); |
| __riscv_vse8_v_u8mf2(dst8, vdst, vl); |
| |
| in += CDEF_BSTRIDE; |
| dst8 += dstride; |
| } while (--h != 0); |
| } else { |
| uint8_t *dst8 = (uint8_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| const vint16m1_t px = |
| load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); |
| vuint8mf2_t vdst = |
| __riscv_vncvt_x_x_w_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(px), vl); |
| store_strided_u8_4x2(dst8, vdst, dstride, vl); |
| |
| in += 2 * CDEF_BSTRIDE; |
| dst8 += 2 * dstride; |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
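| // The cdef_filter_16_* functions below are the high-bit-depth counterparts of |
| // the cdef_filter_8_* functions above: same structure, but results are stored |
| // as 16-bit samples without narrowing. |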
| void cdef_filter_16_0_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| const int po1 = cdef_directions[dir][0]; |
| const int po2 = cdef_directions[dir][1]; |
| const int s1o1 = cdef_directions[dir + 2][0]; |
| const int s1o2 = cdef_directions[dir + 2][1]; |
| const int s2o1 = cdef_directions[dir - 2][0]; |
| const int s2o2 = cdef_directions[dir - 2][1]; |
| MAKE_TAPS; |
| |
| if (pri_strength) { |
| pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
| } |
| if (sec_strength) { |
| sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| SETUP_MINMAX; |
| |
| // Primary pass |
| LOAD_DIR(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| MIN_MAX(p); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Secondary pass 1 |
| LOAD_DIR(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| MIN_MAX(s); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| MIN_MAX(s2); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE16_CLAMPED; |
| } while (--h != 0); |
| } else { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| SETUP_MINMAX; |
| |
| // Primary pass |
| LOAD_DIR4(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| MIN_MAX(p); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Secondary pass 1 |
| LOAD_DIR4(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| MIN_MAX(s); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR4(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| MIN_MAX(s2); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE16_4_CLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
| void cdef_filter_16_1_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)sec_strength; |
| (void)sec_damping; |
| |
| const int po1 = cdef_directions[dir][0]; |
| const int po2 = cdef_directions[dir][1]; |
| MAKE_TAPS; |
| |
| if (pri_strength) { |
| pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| |
| // Primary pass |
| LOAD_DIR(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Store |
| STORE16_UNCLAMPED; |
| } while (--h != 0); |
| } else { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| |
| // Primary pass |
| LOAD_DIR4(p, in, po1, po2); |
| CONSTRAIN(p, pri_strength, pri_damping); |
| PRI_0_UPDATE_SUM(p); |
| |
| // Store |
| STORE16_4_UNCLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
| void cdef_filter_16_2_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)pri_strength; |
| (void)pri_damping; |
| (void)coeff_shift; |
| |
| const int s1o1 = cdef_directions[dir + 2][0]; |
| const int s1o2 = cdef_directions[dir + 2][1]; |
| const int s2o1 = cdef_directions[dir - 2][0]; |
| const int s2o2 = cdef_directions[dir - 2][1]; |
| |
| if (sec_strength) { |
| sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
| } |
| |
| if (block_width == 8) { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| LOAD_PIX(in); |
| |
| // Secondary pass 1 |
| LOAD_DIR(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE16_UNCLAMPED; |
| } while (--h != 0); |
| } else { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| LOAD_PIX4(in); |
| |
| // Secondary pass 1 |
| LOAD_DIR4(s, in, s1o1, s2o1); |
| CONSTRAIN(s, sec_strength, sec_damping); |
| SEC_0_UPDATE_SUM(s); |
| |
| // Secondary pass 2 |
| LOAD_DIR4(s2, in, s1o2, s2o2); |
| CONSTRAIN(s2, sec_strength, sec_damping); |
| UPDATE_SUM(s2); |
| |
| // Store |
| STORE16_4_UNCLAMPED; |
| |
| h -= 2; |
| } while (h != 0); |
| } |
| } |
| |
| void cdef_filter_16_3_rvv(void *dest, int dstride, const uint16_t *in, |
| int pri_strength, int sec_strength, int dir, |
| int pri_damping, int sec_damping, int coeff_shift, |
| int block_width, int block_height) { |
| (void)pri_strength; |
| (void)sec_strength; |
| (void)dir; |
| (void)pri_damping; |
| (void)sec_damping; |
| (void)coeff_shift; |
| |
| if (block_width == 8) { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width; |
| do { |
| const vuint16m1_t px = __riscv_vle16_v_u16m1(in, vl); |
| __riscv_vse16_v_u16m1(dst16, px, vl); |
| |
| in += CDEF_BSTRIDE; |
| dst16 += dstride; |
| } while (--h != 0); |
| } else { |
| uint16_t *dst16 = (uint16_t *)dest; |
| |
| int h = block_height; |
| const size_t vl = block_width << 1; |
| do { |
| const vint16m1_t px = |
| load_strided_i16_4x2((int16_t *)in, CDEF_BSTRIDE, vl); |
| vuint16m1_t vdst = __riscv_vreinterpret_v_i16m1_u16m1(px); |
| store_strided_u16_4x2(dst16, vdst, dstride, vl); |
| |
| in += 2 * CDEF_BSTRIDE; |
| dst16 += 2 * dstride; |
| h -= 2; |
| } while (h != 0); |
| } |
| } |