| /* |
| * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #include <arm_neon.h> |
| #include <assert.h> |
| #include <math.h> |
| |
| #include "av1/common/txb_common.h" |
| #include "av1/encoder/encodetxb.h" |
| #include "av1/common/arm/mem_neon.h" |
| |
| void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width, |
| const int height, uint8_t *const levels) { |
| const int stride = width + TX_PAD_HOR; |
| memset(levels - TX_PAD_TOP * stride, 0, |
| sizeof(*levels) * TX_PAD_TOP * stride); |
| memset(levels + stride * height, 0, |
| sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); |
| |
| const int32x4_t zeros = vdupq_n_s32(0); |
| int i = 0; |
| uint8_t *ls = levels; |
| const tran_low_t *cf = coeff; |
| if (width == 4) { |
| do { |
| const int32x4_t coeffA = vld1q_s32(cf); |
| const int32x4_t coeffB = vld1q_s32(cf + width); |
| const int16x8_t coeffAB = |
| vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); |
| const int16x8_t absAB = vqabsq_s16(coeffAB); |
| const int8x8_t absABs = vqmovn_s16(absAB); |
| #if defined(__aarch64__) |
| const int8x16_t absAB8 = |
| vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros))); |
| const uint8x16_t lsAB = |
| vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros)); |
| #else |
| const int32x2x2_t absAB8 = |
| vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros)); |
| const uint8x16_t lsAB = |
| vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1])); |
| #endif |
| vst1q_u8(ls, lsAB); |
| ls += (stride << 1); |
| cf += (width << 1); |
| i += 2; |
| } while (i < height); |
| } else if (width == 8) { |
| do { |
| const int32x4_t coeffA = vld1q_s32(cf); |
| const int32x4_t coeffB = vld1q_s32(cf + 4); |
| const int16x8_t coeffAB = |
| vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); |
| const int16x8_t absAB = vqabsq_s16(coeffAB); |
| const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8( |
| vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros)))); |
| vst1q_u8(ls, absAB8); |
| ls += stride; |
| cf += width; |
| i += 1; |
| } while (i < height); |
| } else { |
| do { |
| int j = 0; |
| do { |
| const int32x4_t coeffA = vld1q_s32(cf); |
| const int32x4_t coeffB = vld1q_s32(cf + 4); |
| const int32x4_t coeffC = vld1q_s32(cf + 8); |
| const int32x4_t coeffD = vld1q_s32(cf + 12); |
| const int16x8_t coeffAB = |
| vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB)); |
| const int16x8_t coeffCD = |
| vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD)); |
| const int16x8_t absAB = vqabsq_s16(coeffAB); |
| const int16x8_t absCD = vqabsq_s16(coeffCD); |
| const uint8x16_t absABCD = vreinterpretq_u8_s8( |
| vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD))); |
| vst1q_u8((ls + j), absABCD); |
| j += 16; |
| cf += 16; |
| } while (j < width); |
| *(int32_t *)(ls + width) = 0; |
| ls += stride; |
| i += 1; |
| } while (i < height); |
| } |
| } |
| |
| // get_4_nz_map_contexts_2d coefficients: |
| static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = { |
| { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 }, |
| { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 } |
| }; |
| |
| // get_4_nz_map_contexts_hor coefficients: |
| /* clang-format off */ |
| #define SIG_COEF_CONTEXTS_2D_X4_051010 \ |
| (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \ |
| ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24)) |
| /* clang-format on */ |
| |
| // get_4_nz_map_contexts_ver coefficients: |
| static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = { |
| SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, |
| SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, |
| SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, |
| SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 |
| }; |
| |
| // get_8_coeff_contexts_2d coefficients: |
| // if (height == 8) |
| static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = { |
| { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 }, |
| { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } |
| }; |
| // if (height < 8) |
| static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = { |
| { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 }, |
| { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 } |
| }; |
| |
| // if (height > 8) |
| static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = { |
| { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, |
| { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 } |
| }; |
| |
| // get_4_nz_map_contexts_ver coefficients: |
| static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = { |
| SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 |
| }; |
| |
| // get_16n_coeff_contexts_2d coefficients: |
| // real_width == real_height |
| static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = { |
| { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } |
| }; |
| |
| // real_width > real_height |
| static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = { |
| { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } |
| }; |
| |
| // real_width < real_height |
| static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = { |
| { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }, |
| { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }, |
| { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 } |
| }; |
| |
| // get_16n_coeff_contexts_hor coefficients: |
| static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = { |
| SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, |
| SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10 |
| }; |
| |
| // end of coefficients declaration area |
| |
| static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src, |
| const int byte_stride) { |
| #ifdef __aarch64__ |
| uint32x4_t v_data = vld1q_u32((uint32_t *)src); |
| v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1); |
| v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2); |
| v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3); |
| |
| return vreinterpretq_u8_u32(v_data); |
| #else |
| return load_unaligned_u8q(src, byte_stride); |
| #endif |
| } |
| |
| static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src, |
| const int byte_stride) { |
| #ifdef __aarch64__ |
| uint64x2_t v_data = vld1q_u64((uint64_t *)src); |
| v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1); |
| |
| return vreinterpretq_u8_u64(v_data); |
| #else |
| uint8x8_t v_data_low = vld1_u8(src); |
| uint8x8_t v_data_high = vld1_u8(src + byte_stride); |
| |
| return vcombine_u8(v_data_low, v_data_high); |
| #endif |
| } |
| |
| static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src, |
| const int byte_stride) { |
| (void)byte_stride; |
| return vld1q_u8(src); |
| } |
| |
| static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride, |
| const ptrdiff_t *const offsets, |
| uint8x16_t *const level) { |
| level[0] = load_8bit_4x4_to_1_reg(&src[1], stride); |
| level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride); |
| level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride); |
| level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride); |
| level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride); |
| } |
| |
| static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride, |
| const ptrdiff_t *const offsets, |
| uint8x16_t *const level) { |
| level[0] = load_8bit_8x2_to_1_reg(&src[1], stride); |
| level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride); |
| level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride); |
| level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride); |
| level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride); |
| } |
| |
| static INLINE void load_levels_16x1x5(const uint8_t *const src, |
| const int stride, |
| const ptrdiff_t *const offsets, |
| uint8x16_t *const level) { |
| level[0] = load_8bit_16x1_to_1_reg(&src[1], stride); |
| level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride); |
| level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride); |
| level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride); |
| level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride); |
| } |
| |
| static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) { |
| const uint8x16_t const_3 = vdupq_n_u8(3); |
| const uint8x16_t const_4 = vdupq_n_u8(4); |
| uint8x16_t count; |
| |
| count = vminq_u8(level[0], const_3); |
| level[1] = vminq_u8(level[1], const_3); |
| level[2] = vminq_u8(level[2], const_3); |
| level[3] = vminq_u8(level[3], const_3); |
| level[4] = vminq_u8(level[4], const_3); |
| count = vaddq_u8(count, level[1]); |
| count = vaddq_u8(count, level[2]); |
| count = vaddq_u8(count, level[3]); |
| count = vaddq_u8(count, level[4]); |
| |
| count = vrshrq_n_u8(count, 1); |
| count = vminq_u8(count, const_4); |
| return count; |
| } |
| |
| static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *const coeff_contexts) { |
| const int stride = 4 + TX_PAD_HOR; |
| const uint8x16_t pos_to_offset_large = vdupq_n_u8(21); |
| |
| uint8x16_t pos_to_offset = |
| vld1q_u8((height == 4) ? c_4_po_2d[0] : c_4_po_2d[1]); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| uint8_t *cc = coeff_contexts; |
| |
| assert(!(height % 4)); |
| |
| int row = height; |
| do { |
| load_levels_4x4x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(cc, count); |
| pos_to_offset = pos_to_offset_large; |
| levels += 4 * stride; |
| cc += 16; |
| row -= 4; |
| } while (row); |
| |
| coeff_contexts[0] = 0; |
| } |
| |
| static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = 4 + TX_PAD_HOR; |
| |
| const uint8x16_t pos_to_offset = |
| vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010)); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(height % 4)); |
| |
| int row = height; |
| do { |
| load_levels_4x4x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(coeff_contexts, count); |
| levels += 4 * stride; |
| coeff_contexts += 16; |
| row -= 4; |
| } while (row); |
| } |
| |
| static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = 4 + TX_PAD_HOR; |
| const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); |
| |
| uint8x16_t pos_to_offset = vld1q_u8(c_4_po_ver); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(height % 4)); |
| |
| int row = height; |
| do { |
| load_levels_4x4x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(coeff_contexts, count); |
| pos_to_offset = pos_to_offset_large; |
| levels += 4 * stride; |
| coeff_contexts += 16; |
| row -= 4; |
| } while (row); |
| } |
| |
| static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = 8 + TX_PAD_HOR; |
| uint8_t *cc = coeff_contexts; |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| uint8x16_t pos_to_offset[3]; |
| |
| assert(!(height % 2)); |
| |
| if (height == 8) { |
| pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]); |
| pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]); |
| } else if (height < 8) { |
| pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]); |
| pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]); |
| } else { |
| pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]); |
| pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]); |
| } |
| pos_to_offset[2] = vdupq_n_u8(21); |
| |
| int row = height; |
| do { |
| load_levels_8x2x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset[0]); |
| vst1q_u8(cc, count); |
| pos_to_offset[0] = pos_to_offset[1]; |
| pos_to_offset[1] = pos_to_offset[2]; |
| levels += 2 * stride; |
| cc += 16; |
| row -= 2; |
| } while (row); |
| |
| coeff_contexts[0] = 0; |
| } |
| |
| static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = 8 + TX_PAD_HOR; |
| |
| const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_hor); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(height % 2)); |
| |
| int row = height; |
| do { |
| load_levels_8x2x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(coeff_contexts, count); |
| levels += 2 * stride; |
| coeff_contexts += 16; |
| row -= 2; |
| } while (row); |
| } |
| |
| static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, |
| const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = 8 + TX_PAD_HOR; |
| const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); |
| |
| uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0), |
| vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5)); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(height % 2)); |
| |
| int row = height; |
| do { |
| load_levels_8x2x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(coeff_contexts, count); |
| pos_to_offset = pos_to_offset_large; |
| levels += 2 * stride; |
| coeff_contexts += 16; |
| row -= 2; |
| } while (row); |
| } |
| |
| static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, |
| const int real_width, |
| const int real_height, |
| const int width, const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = width + TX_PAD_HOR; |
| uint8_t *cc = coeff_contexts; |
| int row = height; |
| uint8x16_t pos_to_offset[5]; |
| uint8x16_t pos_to_offset_large[3]; |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(width % 16)); |
| |
| pos_to_offset_large[2] = vdupq_n_u8(21); |
| if (real_width == real_height) { |
| pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]); |
| pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]); |
| pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]); |
| pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]); |
| pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = |
| pos_to_offset_large[2]; |
| } else if (real_width > real_height) { |
| pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]); |
| pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]); |
| pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = |
| vld1q_u8(c_16_po_2d_g[2]); |
| pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; |
| } else { // real_width < real_height |
| pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]); |
| pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]); |
| pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]); |
| pos_to_offset[4] = pos_to_offset_large[2]; |
| pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11); |
| } |
| |
| do { |
| int w = width; |
| |
| do { |
| load_levels_16x1x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset[0]); |
| vst1q_u8(cc, count); |
| levels += 16; |
| cc += 16; |
| w -= 16; |
| pos_to_offset[0] = pos_to_offset_large[0]; |
| } while (w); |
| |
| pos_to_offset[0] = pos_to_offset[1]; |
| pos_to_offset[1] = pos_to_offset[2]; |
| pos_to_offset[2] = pos_to_offset[3]; |
| pos_to_offset[3] = pos_to_offset[4]; |
| pos_to_offset_large[0] = pos_to_offset_large[1]; |
| pos_to_offset_large[1] = pos_to_offset_large[2]; |
| levels += TX_PAD_HOR; |
| } while (--row); |
| |
| coeff_contexts[0] = 0; |
| } |
| |
| static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, |
| const int width, const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = width + TX_PAD_HOR; |
| |
| const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); |
| |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(width % 16)); |
| |
| int row = height; |
| do { |
| uint8x16_t pos_to_offset = vld1q_u8(c_16_po_hor); |
| |
| int w = width; |
| do { |
| load_levels_16x1x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset); |
| vst1q_u8(coeff_contexts, count); |
| pos_to_offset = pos_to_offset_large; |
| levels += 16; |
| coeff_contexts += 16; |
| w -= 16; |
| } while (w); |
| |
| levels += TX_PAD_HOR; |
| } while (--row); |
| } |
| |
| static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, |
| const int width, const int height, |
| const ptrdiff_t *const offsets, |
| uint8_t *coeff_contexts) { |
| const int stride = width + TX_PAD_HOR; |
| |
| uint8x16_t pos_to_offset[3]; |
| uint8x16_t count; |
| uint8x16_t level[5]; |
| |
| assert(!(width % 16)); |
| |
| pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0); |
| pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5); |
| pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10); |
| |
| int row = height; |
| do { |
| int w = width; |
| do { |
| load_levels_16x1x5(levels, stride, offsets, level); |
| count = get_coeff_contexts_kernel(level); |
| count = vaddq_u8(count, pos_to_offset[0]); |
| vst1q_u8(coeff_contexts, count); |
| levels += 16; |
| coeff_contexts += 16; |
| w -= 16; |
| } while (w); |
| |
| pos_to_offset[0] = pos_to_offset[1]; |
| pos_to_offset[1] = pos_to_offset[2]; |
| levels += TX_PAD_HOR; |
| } while (--row); |
| } |
| |
| // Note: levels[] must be in the range [0, 127], inclusive. |
| void av1_get_nz_map_contexts_neon(const uint8_t *const levels, |
| const int16_t *const scan, const uint16_t eob, |
| const TX_SIZE tx_size, |
| const TX_CLASS tx_class, |
| int8_t *const coeff_contexts) { |
| const int last_idx = eob - 1; |
| if (!last_idx) { |
| coeff_contexts[0] = 0; |
| return; |
| } |
| |
| uint8_t *const coefficients = (uint8_t *const)coeff_contexts; |
| |
| const int real_width = tx_size_wide[tx_size]; |
| const int real_height = tx_size_high[tx_size]; |
| const int width = get_txb_wide(tx_size); |
| const int height = get_txb_high(tx_size); |
| const int stride = width + TX_PAD_HOR; |
| ptrdiff_t offsets[3]; |
| |
| /* coeff_contexts must be 16 byte aligned. */ |
| assert(!((intptr_t)coeff_contexts & 0xf)); |
| |
| if (tx_class == TX_CLASS_2D) { |
| offsets[0] = 0 * stride + 2; |
| offsets[1] = 1 * stride + 1; |
| offsets[2] = 2 * stride + 0; |
| |
| if (width == 4) { |
| get_4_nz_map_contexts_2d(levels, height, offsets, coefficients); |
| } else if (width == 8) { |
| get_8_coeff_contexts_2d(levels, height, offsets, coefficients); |
| } else { |
| get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, |
| offsets, coefficients); |
| } |
| } else if (tx_class == TX_CLASS_HORIZ) { |
| offsets[0] = 2; |
| offsets[1] = 3; |
| offsets[2] = 4; |
| if (width == 4) { |
| get_4_nz_map_contexts_hor(levels, height, offsets, coefficients); |
| } else if (width == 8) { |
| get_8_coeff_contexts_hor(levels, height, offsets, coefficients); |
| } else { |
| get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients); |
| } |
| } else { // TX_CLASS_VERT |
| offsets[0] = 2 * stride; |
| offsets[1] = 3 * stride; |
| offsets[2] = 4 * stride; |
| if (width == 4) { |
| get_4_nz_map_contexts_ver(levels, height, offsets, coefficients); |
| } else if (width == 8) { |
| get_8_coeff_contexts_ver(levels, height, offsets, coefficients); |
| } else { |
| get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients); |
| } |
| } |
| |
| const int bwl = get_txb_bwl(tx_size); |
| const int pos = scan[last_idx]; |
| if (last_idx <= (height << bwl) / 8) |
| coeff_contexts[pos] = 1; |
| else if (last_idx <= (height << bwl) / 4) |
| coeff_contexts[pos] = 2; |
| else |
| coeff_contexts[pos] = 3; |
| } |