// av1/encoder/arm/neon/encodetxb_neon.c - avm - Git at Google

 /*
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <arm_neon.h>
 #include <assert.h>
 #include <math.h>

 #include "av1/common/txb_common.h"
 #include "av1/encoder/encodetxb.h"
 #include "av1/common/arm/mem_neon.h"

 // Initializes the padded `levels` buffer from a block of coefficients.
 //
 // Every visible entry becomes min(|coeff|, 127): the value is narrowed with
 // saturation to 16 bits, made absolute, then narrowed with saturation to a
 // signed byte. The TX_PAD_TOP rows above the block, the TX_PAD_BOTTOM rows
 // (plus TX_PAD_END bytes) below it, and the horizontal padding after each row
 // are set to zero.
 //
 //   coeff   width * height coefficients, rows stored back to back.
 //   width   coefficients per row; 4 and 8 have dedicated paths, any other
 //           value is processed 16 columns at a time.
 //   height  number of rows (the 4-wide path consumes two rows per step).
 //   levels  first visible entry of the padded buffer; consecutive rows are
 //           `width + TX_PAD_HOR` bytes apart.
 void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                               const int height, uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
   // Clear the padding above and below the visible rows first; the vector
   // stores below may spill zeros into the bottom padding, never data.
   memset(levels - TX_PAD_TOP * stride, 0,
          sizeof(*levels) * TX_PAD_TOP * stride);
   memset(levels + stride * height, 0,
          sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));

   const int32x4_t zeros = vdupq_n_s32(0);
   int i = 0;
   uint8_t *ls = levels;
   const tran_low_t *cf = coeff;
   if (width == 4) {
     do {
       // Two rows per iteration: A is row i, B is row i + 1.
       const int32x4_t coeffA = vld1q_s32(cf);
       const int32x4_t coeffB = vld1q_s32(cf + width);
       const int16x8_t coeffAB =
           vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
       const int16x8_t absAB = vqabsq_s16(coeffAB);
       const int8x8_t absABs = vqmovn_s16(absAB);
       // Interleave with zeros so the 16 stored bytes are
       // [row A, 4 zero bytes, row B, 4 zero bytes].
 #if defined(__aarch64__)
       const int8x16_t absAB8 =
           vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
       const uint8x16_t lsAB =
           vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
 #else
       const int32x2x2_t absAB8 =
           vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
       const uint8x16_t lsAB =
           vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
 #endif
       vst1q_u8(ls, lsAB);
       ls += (stride << 1);
       cf += (width << 1);
       i += 2;
     } while (i < height);
   } else if (width == 8) {
     do {
       const int32x4_t coeffA = vld1q_s32(cf);
       const int32x4_t coeffB = vld1q_s32(cf + 4);
       const int16x8_t coeffAB =
           vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
       const int16x8_t absAB = vqabsq_s16(coeffAB);
       // Store the 8 levels followed by 8 zero bytes. Zeros that reach past
       // the row padding are rewritten by the next iteration or land in the
       // already-cleared bottom padding.
       const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
           vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
       vst1q_u8(ls, absAB8);
       ls += stride;
       cf += width;
       i += 1;
     } while (i < height);
   } else {
     do {
       int j = 0;
       do {
         // 16 coefficients -> 16 level bytes per step.
         const int32x4_t coeffA = vld1q_s32(cf);
         const int32x4_t coeffB = vld1q_s32(cf + 4);
         const int32x4_t coeffC = vld1q_s32(cf + 8);
         const int32x4_t coeffD = vld1q_s32(cf + 12);
         const int16x8_t coeffAB =
             vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
         const int16x8_t coeffCD =
             vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
         const int16x8_t absAB = vqabsq_s16(coeffAB);
         const int16x8_t absCD = vqabsq_s16(coeffCD);
         const uint8x16_t absABCD = vreinterpretq_u8_s8(
             vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
         vst1q_u8((ls + j), absABCD);
         j += 16;
         cf += 16;
       } while (j < width);
       // Zero the horizontal padding after the row. memset avoids storing
       // through an int32_t pointer into a uint8_t buffer (aliasing and
       // alignment hazard) and covers exactly the TX_PAD_HOR bytes that
       // separate two rows.
       memset(ls + width, 0, TX_PAD_HOR);
       ls += stride;
       i += 1;
     } while (i < height);
   }
 }

 // get_4_nz_map_contexts_2d coefficients:
 // Position offsets added to the neighbor count for the first group of four
 // 4-wide rows (16 entries = 4 rows x 4 columns). Row [0] is loaded when
 // height == 4, row [1] for any other height; later groups use a constant 21.
 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
   { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
   { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
 };

 // get_4_nz_map_contexts_hor coefficients:
 // A single 32-bit word packing four byte offsets, lowest byte first:
 // SIG_COEF_CONTEXTS_2D + 0, + 5, + 10, + 10. get_4_nz_map_contexts_hor
 // broadcasts this word to all four 32-bit lanes of its offset vector.
 /* clang-format off */
 #define SIG_COEF_CONTEXTS_2D_X4_051010                        \
   (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
   ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
 /* clang-format on */

 // get_4_nz_map_contexts_ver coefficients:
 // Offsets for the first group of four 4-wide rows: row 0 uses
 // SIG_COEF_CONTEXTS_2D + 0, row 1 uses + 5, rows 2 and 3 use + 10. Later
 // groups use + 10 for every entry.
 static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
   SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
 };

 // get_8_coeff_contexts_2d coefficients:
 // Each 16-entry row holds the offsets for two consecutive 8-wide rows.
 // Entry [0] covers rows 0-1, entry [1] covers rows 2-3; later rows use a
 // constant 21.
 // if (height == 8)
 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
   { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
 };
 // if (height < 8)
 // Same layout as c_8_po_2d_8 (two 8-wide rows per 16-entry row); selected by
 // get_8_coeff_contexts_2d when the block has fewer than 8 rows.
 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
   { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
   { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
 };

 // if (height > 8)
 // Same layout as c_8_po_2d_8 (two 8-wide rows per 16-entry row); selected by
 // get_8_coeff_contexts_2d when the block has more than 8 rows.
 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
   { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
   { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
 };

 // get_8_coeff_contexts_hor coefficients:
 // Per-column offsets for two consecutive 8-wide rows (the same 8 entries
 // repeated twice): column 0 uses SIG_COEF_CONTEXTS_2D + 0, column 1 uses + 5,
 // columns 2-7 use + 10. Applied unchanged to every pair of rows.
 static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
 };

 // get_16n_coeff_contexts_2d coefficients:
 // Offsets for the first 16 columns of rows 0..3 (one 16-entry row each).
 // Columns beyond the first 16 and rows past the table use a constant 21.
 // real_width == real_height
 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
   { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
 };

 // real_width > real_height
 // Offsets for the first 16 columns: entry [0] for row 0, entry [1] for
 // row 1, entry [2] for row 2 and every later row (it is also the value that
 // get_16n_coeff_contexts_2d keeps at the end of its offset queue).
 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
   { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
 };

 // real_width < real_height
 // Offsets for the first 16 columns: entry [0] for rows 0 and 1, entry [1]
 // for row 2, entry [2] for row 3; later rows use a constant 21.
 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
   { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
   { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
   { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
 };

 // get_16n_coeff_contexts_hor coefficients:
 // Per-column offsets for the first 16 columns of every row: column 0 uses
 // SIG_COEF_CONTEXTS_2D + 0, column 1 uses + 5, columns 2-15 use + 10.
 // Columns beyond the first 16 use + 10 throughout.
 static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
   SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
   SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
 };

 // end of coefficients declaration area

 // Gathers four 4-byte rows, `byte_stride` bytes apart, into one 16-byte
 // vector (row r ends up in 32-bit lane r).
 //
 // On AArch64 the first load reads 16 bytes starting at `src`, so at least
 // that many bytes must be readable there even though only the first four are
 // kept. The pointer casts preserve the const qualifier of `src`.
 static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
                                                 const int byte_stride) {
 #ifdef __aarch64__
   // Lane 0 comes from the full-width load; lanes 1..3 are replaced one row
   // at a time.
   uint32x4_t v_data = vld1q_u32((const uint32_t *)src);
   v_data =
       vld1q_lane_u32((const uint32_t *)(src + 1 * byte_stride), v_data, 1);
   v_data =
       vld1q_lane_u32((const uint32_t *)(src + 2 * byte_stride), v_data, 2);
   v_data =
       vld1q_lane_u32((const uint32_t *)(src + 3 * byte_stride), v_data, 3);

   return vreinterpretq_u8_u32(v_data);
 #else
   // Helper declared outside this file; presumably performs the same 4x4
   // gather - confirm against its definition.
   return load_unaligned_u8q(src, byte_stride);
 #endif
 }

 // Gathers two 8-byte rows, `byte_stride` bytes apart, into one 16-byte
 // vector: the first row in the low half, the second row in the high half.
 //
 // On AArch64 the first load reads 16 bytes starting at `src`; the upper half
 // is then replaced by the second row. The pointer casts preserve the const
 // qualifier of `src`.
 static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
                                                 const int byte_stride) {
 #ifdef __aarch64__
   uint64x2_t v_data = vld1q_u64((const uint64_t *)src);
   v_data =
       vld1q_lane_u64((const uint64_t *)(src + 1 * byte_stride), v_data, 1);

   return vreinterpretq_u8_u64(v_data);
 #else
   uint8x8_t v_data_low = vld1_u8(src);
   uint8x8_t v_data_high = vld1_u8(src + byte_stride);

   return vcombine_u8(v_data_low, v_data_high);
 #endif
 }

 // Loads one 16-byte row starting at `src`. `byte_stride` is accepted only so
 // the signature matches the 4x4 and 8x2 loaders; it is not used.
 static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
                                                  const int byte_stride) {
   const uint8x16_t row = vld1q_u8(src);
   (void)byte_stride;
   return row;
 }

 // Loads the five neighbor tiles for a 4x4 group of positions. For each
 // position p the neighbors are p + 1, p + stride and p + offsets[0..2];
 // level[k] receives the 4x4 tile of the k-th neighbor.
 static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
   const ptrdiff_t delta[5] = { 1, stride, offsets[0], offsets[1], offsets[2] };
   for (int k = 0; k < 5; ++k) {
     level[k] = load_8bit_4x4_to_1_reg(src + delta[k], stride);
   }
 }

 // Loads the five neighbor tiles for an 8x2 group of positions. For each
 // position p the neighbors are p + 1, p + stride and p + offsets[0..2];
 // level[k] receives the 8x2 tile of the k-th neighbor.
 static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
                                      const ptrdiff_t *const offsets,
                                      uint8x16_t *const level) {
   const ptrdiff_t delta[5] = { 1, stride, offsets[0], offsets[1], offsets[2] };
   for (int k = 0; k < 5; ++k) {
     level[k] = load_8bit_8x2_to_1_reg(src + delta[k], stride);
   }
 }

 // Loads the five neighbor rows for 16 consecutive positions. For each
 // position p the neighbors are p + 1, p + stride and p + offsets[0..2];
 // level[k] receives the 16-byte row of the k-th neighbor.
 static INLINE void load_levels_16x1x5(const uint8_t *const src,
                                       const int stride,
                                       const ptrdiff_t *const offsets,
                                       uint8x16_t *const level) {
   const ptrdiff_t delta[5] = { 1, stride, offsets[0], offsets[1], offsets[2] };
   for (int k = 0; k < 5; ++k) {
     level[k] = load_8bit_16x1_to_1_reg(src + delta[k], stride);
   }
 }

 // Combines five neighbor-level vectors into a per-position count:
 // each level is clamped to 3, the five clamped values are summed, and the
 // result is min((sum + 1) >> 1, 4).
 //
 // Side effect: level[1..4] are overwritten with their clamped values;
 // level[0] is left untouched.
 static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
   const uint8x16_t level_cap = vdupq_n_u8(3);
   uint8x16_t sum = vminq_u8(level[0], level_cap);

   for (int k = 1; k < 5; ++k) {
     level[k] = vminq_u8(level[k], level_cap);
     sum = vaddq_u8(sum, level[k]);
   }

   // Rounding halving shift, then cap at 4.
   const uint8x16_t halved = vrshrq_n_u8(sum, 1);
   return vminq_u8(halved, vdupq_n_u8(4));
 }

 // Writes the contexts of a 4-wide block (2D neighbor layout) to
 // `coeff_contexts`, four rows (16 bytes) per iteration. The first group of
 // rows uses a table from c_4_po_2d chosen by `height`; all later groups add
 // a constant 21. The entry for position 0 is forced to 0 at the end.
 // `height` must be a non-zero multiple of 4.
 static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *const coeff_contexts) {
   const int stride = TX_PAD_HOR + 4;
   const uint8x16_t later_offset = vdupq_n_u8(21);
   uint8x16_t offset = vld1q_u8(c_4_po_2d[(height == 4) ? 0 : 1]);
   uint8x16_t nbr[5];
   uint8_t *out = coeff_contexts;
   int rows_left = height;

   assert(height % 4 == 0);

   do {
     load_levels_4x4x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), offset);
     vst1q_u8(out, ctx);
     offset = later_offset;
     out += 16;
     levels += 4 * stride;
     rows_left -= 4;
   } while (rows_left != 0);

   coeff_contexts[0] = 0;
 }

 // Writes the contexts of a 4-wide block (horizontal neighbor layout) to
 // `coeff_contexts`, four rows (16 bytes) per iteration. Every row receives
 // the same four per-column offsets packed in
 // SIG_COEF_CONTEXTS_2D_X4_051010. `height` must be a non-zero multiple of 4.
 static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
                                              const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
   const int stride = TX_PAD_HOR + 4;
   const uint32x4_t packed = vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010);
   const uint8x16_t col_offset = vreinterpretq_u8_u32(packed);
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(height % 4 == 0);

   do {
     load_levels_4x4x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx =
         vaddq_u8(get_coeff_contexts_kernel(nbr), col_offset);
     vst1q_u8(coeff_contexts, ctx);
     coeff_contexts += 16;
     levels += 4 * stride;
     rows_left -= 4;
   } while (rows_left != 0);
 }

 // Writes the contexts of a 4-wide block (vertical neighbor layout) to
 // `coeff_contexts`, four rows (16 bytes) per iteration. The first four rows
 // use the per-row offsets in c_4_po_ver; every later group adds
 // SIG_COEF_CONTEXTS_2D + 10. `height` must be a non-zero multiple of 4.
 static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
                                              const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
   const int stride = TX_PAD_HOR + 4;
   const uint8x16_t later_offset = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
   uint8x16_t offset = vld1q_u8(c_4_po_ver);
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(height % 4 == 0);

   do {
     load_levels_4x4x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), offset);
     vst1q_u8(coeff_contexts, ctx);
     offset = later_offset;
     coeff_contexts += 16;
     levels += 4 * stride;
     rows_left -= 4;
   } while (rows_left != 0);
 }

 // Writes the contexts of an 8-wide block (2D neighbor layout) to
 // `coeff_contexts`, two rows (16 bytes) per iteration. The offset table is
 // picked by comparing `height` with 8; its two entries serve rows 0-1 and
 // rows 2-3, after which a constant 21 is used. The entry for position 0 is
 // forced to 0 at the end. `height` must be a non-zero multiple of 2.
 static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
                                            const int height,
                                            const ptrdiff_t *const offsets,
                                            uint8_t *coeff_contexts) {
   const int stride = TX_PAD_HOR + 8;
   const uint8_t(*table)[16];
   uint8x16_t nbr[5];
   uint8_t *out = coeff_contexts;
   int rows_left = height;

   assert(height % 2 == 0);

   if (height == 8) {
     table = c_8_po_2d_8;
   } else if (height < 8) {
     table = c_8_po_2d_l;
   } else {
     table = c_8_po_2d_g;
   }

   // Three-stage queue: current pair of rows, next pair, everything after.
   uint8x16_t current = vld1q_u8(table[0]);
   uint8x16_t upcoming = vld1q_u8(table[1]);
   const uint8x16_t remainder = vdupq_n_u8(21);

   do {
     load_levels_8x2x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), current);
     vst1q_u8(out, ctx);
     current = upcoming;
     upcoming = remainder;
     out += 16;
     levels += 2 * stride;
     rows_left -= 2;
   } while (rows_left != 0);

   coeff_contexts[0] = 0;
 }

 // Writes the contexts of an 8-wide block (horizontal neighbor layout) to
 // `coeff_contexts`, two rows (16 bytes) per iteration. The per-column
 // offsets in c_8_po_hor are applied to every pair of rows. `height` must be
 // a non-zero multiple of 2.
 static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
   const int stride = TX_PAD_HOR + 8;
   const uint8x16_t col_offset = vld1q_u8(c_8_po_hor);
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(height % 2 == 0);

   do {
     load_levels_8x2x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx =
         vaddq_u8(get_coeff_contexts_kernel(nbr), col_offset);
     vst1q_u8(coeff_contexts, ctx);
     coeff_contexts += 16;
     levels += 2 * stride;
     rows_left -= 2;
   } while (rows_left != 0);
 }

 // Writes the contexts of an 8-wide block (vertical neighbor layout) to
 // `coeff_contexts`, two rows (16 bytes) per iteration. Row 0 adds
 // SIG_COEF_CONTEXTS_2D + 0, row 1 adds + 5, every later row adds + 10.
 // `height` must be a non-zero multiple of 2.
 static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
                                             const int height,
                                             const ptrdiff_t *const offsets,
                                             uint8_t *coeff_contexts) {
   const int stride = TX_PAD_HOR + 8;
   const uint8x8_t row0_offset = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0);
   const uint8x8_t row1_offset = vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5);
   const uint8x16_t later_offset = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
   uint8x16_t offset = vcombine_u8(row0_offset, row1_offset);
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(height % 2 == 0);

   do {
     load_levels_8x2x5(levels, stride, offsets, nbr);
     const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), offset);
     vst1q_u8(coeff_contexts, ctx);
     offset = later_offset;
     coeff_contexts += 16;
     levels += 2 * stride;
     rows_left -= 2;
   } while (rows_left != 0);
 }

 // Writes the contexts of a block whose width is a multiple of 16 (2D
 // neighbor layout) to `coeff_contexts`, 16 positions per inner iteration.
 //
 //   real_width/real_height  only compared with each other, to pick the
 //                           offset tables.
 //   width/height            dimensions actually iterated; `width` must be a
 //                           non-zero multiple of 16, `height` non-zero.
 //
 // Two small queues are rotated once per row:
 //   pos_to_offset[]        offsets for the first 16 columns of the current
 //                          and following rows ([4] is the steady state).
 //   pos_to_offset_large[]  offsets for columns 16 and beyond ([2] is the
 //                          steady state, always 21).
 // The entry for position 0 is forced to 0 at the end.
 static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
                                              const int real_width,
                                              const int real_height,
                                              const int width, const int height,
                                              const ptrdiff_t *const offsets,
                                              uint8_t *coeff_contexts) {
   const int stride = width + TX_PAD_HOR;
   uint8_t *cc = coeff_contexts;
   int row = height;
   uint8x16_t pos_to_offset[5];
   uint8x16_t pos_to_offset_large[3];
   uint8x16_t count;
   uint8x16_t level[5];

   assert(!(width % 16));

   pos_to_offset_large[2] = vdupq_n_u8(21);
   if (real_width == real_height) {
     // Rows 0..3 have dedicated tables; everything else is 21.
     pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
     pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
     pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
     pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
     pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
         pos_to_offset_large[2];
   } else if (real_width > real_height) {
     // Table entry [2] is the steady state for the first 16 columns, so it
     // applies to row 2 and every row after it.
     pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
     pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
     pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
         vld1q_u8(c_16_po_2d_g[2]);
     pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
   } else {  // real_width < real_height
     // Rows 0 and 1 use 11 across the whole width (including the columns
     // past 16); rows 2 and 3 have dedicated tables.
     pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
     pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
     pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
     pos_to_offset[4] = pos_to_offset_large[2];
     pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
   }

   do {
     int w = width;

     do {
       load_levels_16x1x5(levels, stride, offsets, level);
       count = get_coeff_contexts_kernel(level);
       count = vaddq_u8(count, pos_to_offset[0]);
       vst1q_u8(cc, count);
       levels += 16;
       cc += 16;
       w -= 16;
       // After the first 16 columns, switch to the wide-column offsets for
       // the rest of this row.
       pos_to_offset[0] = pos_to_offset_large[0];
     } while (w);

     // Advance both queues by one row; the last element of each stays put.
     pos_to_offset[0] = pos_to_offset[1];
     pos_to_offset[1] = pos_to_offset[2];
     pos_to_offset[2] = pos_to_offset[3];
     pos_to_offset[3] = pos_to_offset[4];
     pos_to_offset_large[0] = pos_to_offset_large[1];
     pos_to_offset_large[1] = pos_to_offset_large[2];
     // Skip the horizontal padding to reach the start of the next row.
     levels += TX_PAD_HOR;
   } while (--row);

   coeff_contexts[0] = 0;
 }

 // Writes the contexts of a block whose width is a multiple of 16
 // (horizontal neighbor layout) to `coeff_contexts`, 16 positions per inner
 // iteration. In every row the first 16 columns use c_16_po_hor and the
 // remaining columns add SIG_COEF_CONTEXTS_2D + 10. `width` must be a
 // non-zero multiple of 16 and `height` non-zero.
 static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
                                               const int width, const int height,
                                               const ptrdiff_t *const offsets,
                                               uint8_t *coeff_contexts) {
   const int stride = width + TX_PAD_HOR;
   const uint8x16_t first_cols_offset = vld1q_u8(c_16_po_hor);
   const uint8x16_t wide_cols_offset = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(width % 16 == 0);

   do {
     uint8x16_t offset = first_cols_offset;
     int cols_left = width;

     do {
       load_levels_16x1x5(levels, stride, offsets, nbr);
       const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), offset);
       vst1q_u8(coeff_contexts, ctx);
       offset = wide_cols_offset;
       coeff_contexts += 16;
       levels += 16;
       cols_left -= 16;
     } while (cols_left != 0);

     // Step over the horizontal padding to the next row.
     levels += TX_PAD_HOR;
     rows_left -= 1;
   } while (rows_left != 0);
 }

 // Writes the contexts of a block whose width is a multiple of 16 (vertical
 // neighbor layout) to `coeff_contexts`, 16 positions per inner iteration.
 // Row 0 adds SIG_COEF_CONTEXTS_2D + 0, row 1 adds + 5 and every later row
 // adds + 10, uniformly across the row. `width` must be a non-zero multiple
 // of 16 and `height` non-zero.
 static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
                                               const int width, const int height,
                                               const ptrdiff_t *const offsets,
                                               uint8_t *coeff_contexts) {
   const int stride = width + TX_PAD_HOR;
   uint8x16_t nbr[5];
   int rows_left = height;

   assert(width % 16 == 0);

   // Queue of row offsets: this row, the next row, all rows after that.
   uint8x16_t current = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
   uint8x16_t upcoming = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
   const uint8x16_t remainder = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

   do {
     int cols_left = width;

     do {
       load_levels_16x1x5(levels, stride, offsets, nbr);
       const uint8x16_t ctx = vaddq_u8(get_coeff_contexts_kernel(nbr), current);
       vst1q_u8(coeff_contexts, ctx);
       coeff_contexts += 16;
       levels += 16;
       cols_left -= 16;
     } while (cols_left != 0);

     current = upcoming;
     upcoming = remainder;
     // Step over the horizontal padding to the next row.
     levels += TX_PAD_HOR;
     rows_left -= 1;
   } while (rows_left != 0);
 }

 // Note: levels[] must be in the range [0, 127], inclusive.
 //
 // Computes the contexts for every position of a transform block and then
 // overwrites the entry of the last coded coefficient with its end-of-block
 // context.
 //
 //   levels          padded level buffer; rows are `width + TX_PAD_HOR` bytes
 //                   apart.
 //   scan            scan order; scan[eob - 1] is the position of the last
 //                   coefficient.
 //   eob             number of coded coefficients. 0 is a no-op; 1 writes
 //                   only coeff_contexts[0] = 0.
 //   tx_size         selects the block dimensions.
 //   tx_class        selects the neighbor layout (2D, horizontal, vertical).
 //   coeff_contexts  output, one byte per position; must be 16-byte aligned
 //                   when eob > 1.
 void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
                                   const int16_t *const scan, const uint16_t eob,
                                   const TX_SIZE tx_size,
                                   const TX_CLASS tx_class,
                                   int8_t *const coeff_contexts) {
   // With eob == 0 there is no last coefficient: last_idx would be -1, the
   // single-coefficient shortcut below would not trigger and scan[-1] would
   // be read. Return before touching anything.
   if (eob == 0) return;

   const int last_idx = eob - 1;
   if (!last_idx) {
     coeff_contexts[0] = 0;
     return;
   }

   // The SIMD helpers store unsigned bytes into the same buffer.
   uint8_t *const coefficients = (uint8_t *)coeff_contexts;

   const int real_width = tx_size_wide[tx_size];
   const int real_height = tx_size_high[tx_size];
   const int width = get_txb_wide(tx_size);
   const int height = get_txb_high(tx_size);
   const int stride = width + TX_PAD_HOR;
   // Positions of the three class-specific neighbors, relative to the
   // current position; the helpers always add the +1 and +stride neighbors.
   ptrdiff_t offsets[3];

   /* coeff_contexts must be 16 byte aligned. */
   assert(!((intptr_t)coeff_contexts & 0xf));

   if (tx_class == TX_CLASS_2D) {
     offsets[0] = 0 * stride + 2;
     offsets[1] = 1 * stride + 1;
     offsets[2] = 2 * stride + 0;

     if (width == 4) {
       get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
     } else if (width == 8) {
       get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
     } else {
       get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
                                 offsets, coefficients);
     }
   } else if (tx_class == TX_CLASS_HORIZ) {
     offsets[0] = 2;
     offsets[1] = 3;
     offsets[2] = 4;
     if (width == 4) {
       get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
     } else if (width == 8) {
       get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
     } else {
       get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
     }
   } else {  // TX_CLASS_VERT
     offsets[0] = 2 * stride;
     offsets[1] = 3 * stride;
     offsets[2] = 4 * stride;
     if (width == 4) {
       get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
     } else if (width == 8) {
       get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
     } else {
       get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
     }
   }

   // End-of-block context for the last coefficient: 1, 2 or 3 depending on
   // whether its scan index lies within the first eighth, the first quarter,
   // or beyond, of (height << bwl) positions.
   const int bwl = get_txb_bwl(tx_size);
   const int pos = scan[last_idx];
   if (last_idx <= (height << bwl) / 8)
     coeff_contexts[pos] = 1;
   else if (last_idx <= (height << bwl) / 4)
     coeff_contexts[pos] = 2;
   else
     coeff_contexts[pos] = 3;
 }
	/*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include <arm_neon.h>
	#include <assert.h>
	#include <math.h>

	#include "av1/common/txb_common.h"
	#include "av1/encoder/encodetxb.h"
	#include "av1/common/arm/mem_neon.h"

	// Initializes the padded `levels` buffer from a block of coefficients.
	//
	// Every visible entry becomes min(|coeff|, 127) (saturating narrow to 16
	// bits, saturating absolute value, saturating narrow to a signed byte).
	// The TX_PAD_TOP rows above the block, the TX_PAD_BOTTOM rows (plus
	// TX_PAD_END bytes) below it, and the horizontal padding after each row
	// are set to zero.
	//
	//   coeff   width * height coefficients, rows stored back to back.
	//   width   coefficients per row; 4 and 8 have dedicated paths, any other
	//           value is processed 16 columns at a time.
	//   height  number of rows (the 4-wide path consumes two rows per step).
	//   levels  first visible entry of the padded buffer; consecutive rows
	//           are `width + TX_PAD_HOR` bytes apart.
	void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
	                              const int height, uint8_t *const levels) {
	  const int stride = width + TX_PAD_HOR;
	  // Clear the padding above and below the visible rows first.
	  memset(levels - TX_PAD_TOP * stride, 0,
	         sizeof(*levels) * TX_PAD_TOP * stride);
	  memset(levels + stride * height, 0,
	         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));

	  const int32x4_t zeros = vdupq_n_s32(0);
	  int i = 0;
	  uint8_t *ls = levels;
	  const tran_low_t *cf = coeff;
	  if (width == 4) {
	    do {
	      // Two rows per iteration: A is row i, B is row i + 1.
	      const int32x4_t coeffA = vld1q_s32(cf);
	      const int32x4_t coeffB = vld1q_s32(cf + width);
	      const int16x8_t coeffAB =
	          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
	      const int16x8_t absAB = vqabsq_s16(coeffAB);
	      const int8x8_t absABs = vqmovn_s16(absAB);
	      // Stored bytes: [row A, 4 zero bytes, row B, 4 zero bytes].
	#if defined(__aarch64__)
	      const int8x16_t absAB8 =
	          vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
	      const uint8x16_t lsAB =
	          vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
	#else
	      const int32x2x2_t absAB8 =
	          vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
	      const uint8x16_t lsAB =
	          vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
	#endif
	      vst1q_u8(ls, lsAB);
	      ls += (stride << 1);
	      cf += (width << 1);
	      i += 2;
	    } while (i < height);
	  } else if (width == 8) {
	    do {
	      const int32x4_t coeffA = vld1q_s32(cf);
	      const int32x4_t coeffB = vld1q_s32(cf + 4);
	      const int16x8_t coeffAB =
	          vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
	      const int16x8_t absAB = vqabsq_s16(coeffAB);
	      // 8 levels followed by 8 zero bytes; zeros past the row padding are
	      // rewritten by the next iteration or fall into cleared padding.
	      const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
	          vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
	      vst1q_u8(ls, absAB8);
	      ls += stride;
	      cf += width;
	      i += 1;
	    } while (i < height);
	  } else {
	    do {
	      int j = 0;
	      do {
	        // 16 coefficients -> 16 level bytes per step.
	        const int32x4_t coeffA = vld1q_s32(cf);
	        const int32x4_t coeffB = vld1q_s32(cf + 4);
	        const int32x4_t coeffC = vld1q_s32(cf + 8);
	        const int32x4_t coeffD = vld1q_s32(cf + 12);
	        const int16x8_t coeffAB =
	            vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
	        const int16x8_t coeffCD =
	            vcombine_s16(vqmovn_s32(coeffC), vqmovn_s32(coeffD));
	        const int16x8_t absAB = vqabsq_s16(coeffAB);
	        const int16x8_t absCD = vqabsq_s16(coeffCD);
	        const uint8x16_t absABCD = vreinterpretq_u8_s8(
	            vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
	        vst1q_u8((ls + j), absABCD);
	        j += 16;
	        cf += 16;
	      } while (j < width);
	      // Zero the horizontal padding after the row; memset avoids a
	      // type-punned int32_t store into the uint8_t buffer.
	      memset(ls + width, 0, TX_PAD_HOR);
	      ls += stride;
	      i += 1;
	    } while (i < height);
	  }
	}

	// get_4_nz_map_contexts_2d coefficients:
	// Position offsets for the first group of four 4-wide rows (16 entries =
	// 4 rows x 4 columns); two alternative rows, one per height case.
	static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
	{ 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
	{ 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
	};

	// get_4_nz_map_contexts_hor coefficients:
	// A single 32-bit word packing four byte offsets, lowest byte first:
	// SIG_COEF_CONTEXTS_2D + 0, + 5, + 10, + 10.
	/* clang-format off */
	#define SIG_COEF_CONTEXTS_2D_X4_051010 \
	(SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
	((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
	/* clang-format on */

	// get_4_nz_map_contexts_ver coefficients:
	// Four entries per row: the first four use SIG_COEF_CONTEXTS_2D + 0, the
	// next four + 5, the remaining eight + 10.
	static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
	SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
	SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
	SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
	SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
	};

	// get_8_coeff_contexts_2d coefficients:
	// Each 16-entry row holds the offsets for two consecutive 8-wide rows.
	// if (height == 8)
	static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
	{ 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
	{ 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
	};
	// if (height < 8)
	// Two 8-wide rows per 16-entry row, as in c_8_po_2d_8.
	static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
	{ 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
	{ 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
	};

	// if (height > 8)
	// Two 8-wide rows per 16-entry row, as in c_8_po_2d_8.
	static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
	{ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
	{ 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
	};

	// get_8_coeff_contexts_hor coefficients:
	// Per-column offsets for two consecutive 8-wide rows (the same 8 entries
	// repeated twice).
	static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
	SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
	};

	// get_16n_coeff_contexts_2d coefficients:
	// One 16-entry row of offsets per block row, covering the first 16
	// columns of rows 0..3.
	// real_width == real_height
	static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
	{ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
	};

	// real_width > real_height
	// Offsets for the first 16 columns; three 16-entry rows.
	static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
	{ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
	};

	// real_width < real_height
	// Offsets for the first 16 columns; three 16-entry rows.
	static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
	{ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
	{ 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
	{ 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
	};

	// get_16n_coeff_contexts_hor coefficients:
	// Per-column offsets for 16 columns: column 0 uses
	// SIG_COEF_CONTEXTS_2D + 0, column 1 uses + 5, columns 2-15 use + 10.
	static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
	SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
	SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
	};

	// end of coefficients declaration area

	// Gathers four 4-byte rows, `byte_stride` bytes apart, into one 16-byte
	// vector (row r ends up in 32-bit lane r). On AArch64 the first load reads
	// 16 bytes starting at `src`, of which only the first four are kept.
	static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
	                                                const int byte_stride) {
	#ifdef __aarch64__
	  uint32x4_t v_data = vld1q_u32((const uint32_t *)src);
	  v_data =
	      vld1q_lane_u32((const uint32_t *)(src + 1 * byte_stride), v_data, 1);
	  v_data =
	      vld1q_lane_u32((const uint32_t *)(src + 2 * byte_stride), v_data, 2);
	  v_data =
	      vld1q_lane_u32((const uint32_t *)(src + 3 * byte_stride), v_data, 3);

	  return vreinterpretq_u8_u32(v_data);
	#else
	  // Helper declared outside this file; presumably performs the same 4x4
	  // gather - confirm against its definition.
	  return load_unaligned_u8q(src, byte_stride);
	#endif
	}

	// Gathers two 8-byte rows, `byte_stride` bytes apart, into one 16-byte
	// vector: first row in the low half, second row in the high half. On
	// AArch64 the first load reads 16 bytes starting at `src`.
	static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
	                                                const int byte_stride) {
	#ifdef __aarch64__
	  uint64x2_t v_data = vld1q_u64((const uint64_t *)src);
	  v_data =
	      vld1q_lane_u64((const uint64_t *)(src + 1 * byte_stride), v_data, 1);

	  return vreinterpretq_u8_u64(v_data);
	#else
	  uint8x8_t v_data_low = vld1_u8(src);
	  uint8x8_t v_data_high = vld1_u8(src + byte_stride);

	  return vcombine_u8(v_data_low, v_data_high);
	#endif
	}

	// Loads one 16-byte row starting at `src`. `byte_stride` exists only to
	// match the signature of the 4x4 and 8x2 loaders and is not used.
	static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
	                                                 const int byte_stride) {
	  const uint8x16_t row = vld1q_u8(src);
	  (void)byte_stride;
	  return row;
	}

	// Fills level[0..4] with five 4x4 byte tiles read relative to src: one byte
	// ahead, one stride ahead, and at the three caller-supplied offsets.
	static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
	                                     const ptrdiff_t *const offsets,
	                                     uint8x16_t *const level) {
	  const uint8_t *const at_plus_one = src + 1;
	  const uint8_t *const at_plus_stride = src + stride;

	  level[0] = load_8bit_4x4_to_1_reg(at_plus_one, stride);
	  level[1] = load_8bit_4x4_to_1_reg(at_plus_stride, stride);
	  level[2] = load_8bit_4x4_to_1_reg(src + offsets[0], stride);
	  level[3] = load_8bit_4x4_to_1_reg(src + offsets[1], stride);
	  level[4] = load_8bit_4x4_to_1_reg(src + offsets[2], stride);
	}

	// Fills level[0..4] with five 8x2 byte tiles read relative to src: one byte
	// ahead, one stride ahead, and at the three caller-supplied offsets.
	static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
	                                     const ptrdiff_t *const offsets,
	                                     uint8x16_t *const level) {
	  const uint8_t *const at_plus_one = src + 1;
	  const uint8_t *const at_plus_stride = src + stride;

	  level[0] = load_8bit_8x2_to_1_reg(at_plus_one, stride);
	  level[1] = load_8bit_8x2_to_1_reg(at_plus_stride, stride);
	  level[2] = load_8bit_8x2_to_1_reg(src + offsets[0], stride);
	  level[3] = load_8bit_8x2_to_1_reg(src + offsets[1], stride);
	  level[4] = load_8bit_8x2_to_1_reg(src + offsets[2], stride);
	}

	// Fills level[0..4] with five 16-byte rows read relative to src: one byte
	// ahead, one stride ahead, and at the three caller-supplied offsets.
	static INLINE void load_levels_16x1x5(const uint8_t *const src,
	                                      const int stride,
	                                      const ptrdiff_t *const offsets,
	                                      uint8x16_t *const level) {
	  const uint8_t *const at_plus_one = src + 1;
	  const uint8_t *const at_plus_stride = src + stride;

	  level[0] = load_8bit_16x1_to_1_reg(at_plus_one, stride);
	  level[1] = load_8bit_16x1_to_1_reg(at_plus_stride, stride);
	  level[2] = load_8bit_16x1_to_1_reg(src + offsets[0], stride);
	  level[3] = load_8bit_16x1_to_1_reg(src + offsets[1], stride);
	  level[4] = load_8bit_16x1_to_1_reg(src + offsets[2], stride);
	}

	// Per lane: clamps each of the five neighbour levels to 3, sums them, halves
	// the sum with rounding ((sum + 1) >> 1) and clamps the result to 4.
	// level[1..4] are overwritten with their clamped values; level[0] is left
	// untouched.
	static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
	  const uint8x16_t level_cap = vdupq_n_u8(3);
	  const uint8x16_t count_cap = vdupq_n_u8(4);

	  uint8x16_t sum = vminq_u8(level[0], level_cap);
	  for (int i = 1; i < 5; ++i) {
	    level[i] = vminq_u8(level[i], level_cap);
	    sum = vaddq_u8(sum, level[i]);
	  }

	  return vminq_u8(vrshrq_n_u8(sum, 1), count_cap);
	}

	// Computes contexts for a 4-wide block (2D class), 16 positions (four rows)
	// per iteration. The first four rows use a position-offset table picked by
	// height (c_4_po_2d[0] when height == 4, c_4_po_2d[1] otherwise); all later
	// rows add a flat 21. The context of position 0 is forced to 0 at the end.
	static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
	                                            const int height,
	                                            const ptrdiff_t *const offsets,
	                                            uint8_t *const coeff_contexts) {
	  const int stride = 4 + TX_PAD_HOR;
	  const uint8x16_t offset_rest = vdupq_n_u8(21);
	  uint8x16_t offset_cur = vld1q_u8(c_4_po_2d[(height == 4) ? 0 : 1]);
	  uint8x16_t neighbors[5];
	  uint8_t *out = coeff_contexts;
	  int rows_left = height;

	  assert(height % 4 == 0);

	  do {
	    load_levels_4x4x5(levels, stride, offsets, neighbors);
	    const uint8x16_t ctx =
	        vaddq_u8(get_coeff_contexts_kernel(neighbors), offset_cur);
	    vst1q_u8(out, ctx);

	    offset_cur = offset_rest;
	    levels += 4 * stride;
	    out += 16;
	    rows_left -= 4;
	  } while (rows_left != 0);

	  coeff_contexts[0] = 0;
	}

	static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
	const int height,
	const ptrdiff_t *const offsets,
	uint8_t *coeff_contexts) {
	const int stride = 4 + TX_PAD_HOR;

	const uint8x16_t pos_to_offset =
	vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));

	uint8x16_t count;
	uint8x16_t level[5];

	assert(!(height % 4));

	int row = height;
	do {
	load_levels_4x4x5(levels, stride, offsets, level);
	count = get_coeff_contexts_kernel(level);
	count = vaddq_u8(count, pos_to_offset);
	vst1q_u8(coeff_contexts, count);
	levels += 4 * stride;
	coeff_contexts += 16;
	row -= 4;
	} while (row);
	}

	// Computes contexts for a 4-wide block (vertical class), 16 positions (four
	// rows) per iteration. The first four rows use the c_4_po_ver offsets; all
	// later rows add a flat SIG_COEF_CONTEXTS_2D + 10.
	static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
	                                             const int height,
	                                             const ptrdiff_t *const offsets,
	                                             uint8_t *coeff_contexts) {
	  const int stride = 4 + TX_PAD_HOR;
	  const uint8x16_t offset_rest = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
	  uint8x16_t offset_cur = vld1q_u8(c_4_po_ver);
	  uint8x16_t neighbors[5];
	  uint8_t *out = coeff_contexts;
	  int rows_left = height;

	  assert(height % 4 == 0);

	  do {
	    load_levels_4x4x5(levels, stride, offsets, neighbors);
	    const uint8x16_t ctx =
	        vaddq_u8(get_coeff_contexts_kernel(neighbors), offset_cur);
	    vst1q_u8(out, ctx);

	    offset_cur = offset_rest;
	    levels += 4 * stride;
	    out += 16;
	    rows_left -= 4;
	  } while (rows_left != 0);
	}

	static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
	const int height,
	const ptrdiff_t *const offsets,
	uint8_t *coeff_contexts) {
	const int stride = 8 + TX_PAD_HOR;
	uint8_t *cc = coeff_contexts;
	uint8x16_t count;
	uint8x16_t level[5];
	uint8x16_t pos_to_offset[3];

	assert(!(height % 2));

	if (height == 8) {
	pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
	pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
	} else if (height < 8) {
	pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
	pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
	} else {
	pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
	pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
	}
	pos_to_offset[2] = vdupq_n_u8(21);

	int row = height;
	do {
	load_levels_8x2x5(levels, stride, offsets, level);
	count = get_coeff_contexts_kernel(level);
	count = vaddq_u8(count, pos_to_offset[0]);
	vst1q_u8(cc, count);
	pos_to_offset[0] = pos_to_offset[1];
	pos_to_offset[1] = pos_to_offset[2];
	levels += 2 * stride;
	cc += 16;
	row -= 2;
	} while (row);

	coeff_contexts[0] = 0;
	}

	// Computes contexts for an 8-wide block (horizontal class), 16 positions (two
	// rows) per iteration. Every row pair adds the same c_8_po_hor offsets.
	static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
	                                            const int height,
	                                            const ptrdiff_t *const offsets,
	                                            uint8_t *coeff_contexts) {
	  const int stride = 8 + TX_PAD_HOR;
	  const uint8x16_t offset_all = vld1q_u8(c_8_po_hor);
	  uint8x16_t neighbors[5];
	  uint8_t *out = coeff_contexts;
	  int rows_left = height;

	  assert(height % 2 == 0);

	  do {
	    load_levels_8x2x5(levels, stride, offsets, neighbors);
	    const uint8x16_t ctx =
	        vaddq_u8(get_coeff_contexts_kernel(neighbors), offset_all);
	    vst1q_u8(out, ctx);

	    levels += 2 * stride;
	    out += 16;
	    rows_left -= 2;
	  } while (rows_left != 0);
	}

	static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
	const int height,
	const ptrdiff_t *const offsets,
	uint8_t *coeff_contexts) {
	const int stride = 8 + TX_PAD_HOR;
	const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

	uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
	vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));

	uint8x16_t count;
	uint8x16_t level[5];

	assert(!(height % 2));

	int row = height;
	do {
	load_levels_8x2x5(levels, stride, offsets, level);
	count = get_coeff_contexts_kernel(level);
	count = vaddq_u8(count, pos_to_offset);
	vst1q_u8(coeff_contexts, count);
	pos_to_offset = pos_to_offset_large;
	levels += 2 * stride;
	coeff_contexts += 16;
	row -= 2;
	} while (row);
	}

	static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
	const int real_width,
	const int real_height,
	const int width, const int height,
	const ptrdiff_t *const offsets,
	uint8_t *coeff_contexts) {
	const int stride = width + TX_PAD_HOR;
	uint8_t *cc = coeff_contexts;
	int row = height;
	uint8x16_t pos_to_offset[5];
	uint8x16_t pos_to_offset_large[3];
	uint8x16_t count;
	uint8x16_t level[5];

	assert(!(width % 16));

	pos_to_offset_large[2] = vdupq_n_u8(21);
	if (real_width == real_height) {
	pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
	pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
	pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
	pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
	pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
	pos_to_offset_large[2];
	} else if (real_width > real_height) {
	pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
	pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
	pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
	vld1q_u8(c_16_po_2d_g[2]);
	pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
	} else { // real_width < real_height
	pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
	pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
	pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
	pos_to_offset[4] = pos_to_offset_large[2];
	pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
	}

	do {
	int w = width;

	do {
	load_levels_16x1x5(levels, stride, offsets, level);
	count = get_coeff_contexts_kernel(level);
	count = vaddq_u8(count, pos_to_offset[0]);
	vst1q_u8(cc, count);
	levels += 16;
	cc += 16;
	w -= 16;
	pos_to_offset[0] = pos_to_offset_large[0];
	} while (w);

	pos_to_offset[0] = pos_to_offset[1];
	pos_to_offset[1] = pos_to_offset[2];
	pos_to_offset[2] = pos_to_offset[3];
	pos_to_offset[3] = pos_to_offset[4];
	pos_to_offset_large[0] = pos_to_offset_large[1];
	pos_to_offset_large[1] = pos_to_offset_large[2];
	levels += TX_PAD_HOR;
	} while (--row);

	coeff_contexts[0] = 0;
	}

	// Computes contexts for a block whose width is a multiple of 16 (horizontal
	// class), 16 positions per step, row by row. The first 16 positions of every
	// row add the c_16_po_hor offsets; the rest of the row adds a flat
	// SIG_COEF_CONTEXTS_2D + 10.
	static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
	                                              const int width, const int height,
	                                              const ptrdiff_t *const offsets,
	                                              uint8_t *coeff_contexts) {
	  const int stride = width + TX_PAD_HOR;
	  const uint8x16_t offset_head = vld1q_u8(c_16_po_hor);
	  const uint8x16_t offset_tail = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
	  uint8x16_t neighbors[5];
	  uint8_t *out = coeff_contexts;
	  int rows_left = height;

	  assert(width % 16 == 0);

	  do {
	    uint8x16_t offset_cur = offset_head;
	    int cols_left = width;

	    do {
	      load_levels_16x1x5(levels, stride, offsets, neighbors);
	      const uint8x16_t ctx =
	          vaddq_u8(get_coeff_contexts_kernel(neighbors), offset_cur);
	      vst1q_u8(out, ctx);

	      offset_cur = offset_tail;
	      levels += 16;
	      out += 16;
	      cols_left -= 16;
	    } while (cols_left != 0);

	    // Skip the horizontal padding that follows each row of levels.
	    levels += TX_PAD_HOR;
	  } while (--rows_left != 0);
	}

	static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
	const int width, const int height,
	const ptrdiff_t *const offsets,
	uint8_t *coeff_contexts) {
	const int stride = width + TX_PAD_HOR;

	uint8x16_t pos_to_offset[3];
	uint8x16_t count;
	uint8x16_t level[5];

	assert(!(width % 16));

	pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
	pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
	pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);

	int row = height;
	do {
	int w = width;
	do {
	load_levels_16x1x5(levels, stride, offsets, level);
	count = get_coeff_contexts_kernel(level);
	count = vaddq_u8(count, pos_to_offset[0]);
	vst1q_u8(coeff_contexts, count);
	levels += 16;
	coeff_contexts += 16;
	w -= 16;
	} while (w);

	pos_to_offset[0] = pos_to_offset[1];
	pos_to_offset[1] = pos_to_offset[2];
	levels += TX_PAD_HOR;
	} while (--row);
	}

	// Fills coeff_contexts[] for one transform block.
	//
	// When eob == 1 only coeff_contexts[0] is written (with 0). Otherwise a
	// width- and class-specific helper writes a context for every position of
	// the block, and the entry at the last scanned position, scan[eob - 1], is
	// then overwritten with 1, 2 or 3 depending on whether eob - 1 is at most
	// 1/8, at most 1/4, or more than 1/4 of (height << bwl).
	//
	// Note: levels[] must be in the range [0, 127], inclusive.
	// NOTE(review): eob == 0 would index scan[-1]; callers presumably guarantee
	// eob >= 1 - confirm.
	void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
	                                  const int16_t *const scan, const uint16_t eob,
	                                  const TX_SIZE tx_size,
	                                  const TX_CLASS tx_class,
	                                  int8_t *const coeff_contexts) {
	  const int last_idx = eob - 1;
	  if (!last_idx) {
	    coeff_contexts[0] = 0;
	    return;
	  }

	  // The NEON helpers store unsigned bytes, so they write through an unsigned
	  // view of the caller's buffer.
	  uint8_t *const coefficients = (uint8_t *)coeff_contexts;

	  const int real_width = tx_size_wide[tx_size];
	  const int real_height = tx_size_high[tx_size];
	  const int width = get_txb_wide(tx_size);
	  const int height = get_txb_high(tx_size);
	  const int stride = width + TX_PAD_HOR;
	  ptrdiff_t offsets[3];

	  /* coeff_contexts must be 16 byte aligned. */
	  assert(!((intptr_t)coeff_contexts & 0xf));

	  // offsets[] holds three of the five neighbour positions that are summed per
	  // coefficient; the load_levels_* helpers always add the neighbours at +1
	  // and +stride themselves.
	  if (tx_class == TX_CLASS_2D) {
	    offsets[0] = 0 * stride + 2;
	    offsets[1] = 1 * stride + 1;
	    offsets[2] = 2 * stride + 0;

	    if (width == 4) {
	      get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
	    } else if (width == 8) {
	      get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
	    } else {
	      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
	                                offsets, coefficients);
	    }
	  } else if (tx_class == TX_CLASS_HORIZ) {
	    offsets[0] = 2;
	    offsets[1] = 3;
	    offsets[2] = 4;
	    if (width == 4) {
	      get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
	    } else if (width == 8) {
	      get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
	    } else {
	      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
	    }
	  } else {  // TX_CLASS_VERT
	    offsets[0] = 2 * stride;
	    offsets[1] = 3 * stride;
	    offsets[2] = 4 * stride;
	    if (width == 4) {
	      get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
	    } else if (width == 8) {
	      get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
	    } else {
	      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
	    }
	  }

	  // The last scanned coefficient gets a context based only on how far into
	  // the block it lies, replacing whatever the helpers stored there.
	  const int bwl = get_txb_bwl(tx_size);
	  const int pos = scan[last_idx];
	  const int num_positions = height << bwl;
	  if (last_idx <= num_positions / 8) {
	    coeff_contexts[pos] = 1;
	  } else if (last_idx <= num_positions / 4) {
	    coeff_contexts[pos] = 2;
	  } else {
	    coeff_contexts[pos] = 3;
	  }
	}