Add NEON version of av1_get_nz_map_contexts function

SpeedUp

size    gain

 4x 4:  8.95
 8x 8:  16.18
16x16:  17.84
32x32:  21.36
64x64:  21.14
 4x 8:  9.86
 8x 4:  10.66
 8x16:  16.63
16x 8:  14.88
16x32:  17.56
32x16:  20.16
32x64:  20.98
64x32:  20.95
 4x16:  12.34
16x 4:  12.38
 8x32:  18.77
32x 8:  18.63
16x64:  17.56
64x16:  20.32

via NEON/EncodeTxbTest.DISABLED_SpeedTestGetNzMapContexts

Change-Id: I288b4fe2e33fd098bc87ef62a0cc4911320c8836
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 3869447..01d63dc 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -369,7 +369,7 @@
 
   # txb
   add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
-  specialize qw/av1_get_nz_map_contexts sse2/;
+  specialize qw/av1_get_nz_map_contexts sse2 neon/;
   add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
   specialize qw/av1_txb_init_levels sse4_1 avx2 neon/;
 
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
index 045c622..0b419f7 100644
--- a/av1/encoder/arm/neon/encodetxb_neon.c
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -10,10 +10,11 @@
  */
 
 #include <arm_neon.h>
-
+#include <assert.h>
 #include <math.h>
 
 #include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
 
 void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                               const int height, uint8_t *const levels) {
@@ -91,3 +92,557 @@
     } while (i < height);
   }
 }
+
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+  { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+  { 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_hor coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010                        \
+  (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+  ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_ver[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 0,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 5,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (height == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (height < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (height > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+  { 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+  { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+  { 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+  { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
+  { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+  { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_hor[16]) = {
+  SIG_COEF_CONTEXTS_2D + 0,  SIG_COEF_CONTEXTS_2D + 5,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+  SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// end of coefficients declaration area
+
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+  uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+  v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+  return vreinterpretq_u8_u32(v_data);
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+                                                const int byte_stride) {
+#ifdef __aarch64__
+  uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+  v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+  return vreinterpretq_u8_u64(v_data);
+#else
+  uint8x8_t v_data_low = vld1_u8(src);
+  uint8x8_t v_data_high = vld1_u8(src + byte_stride);
+
+  return vcombine_u8(v_data_low, v_data_high);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+                                                 const int byte_stride) {
+  (void)byte_stride;
+  return vld1q_u8(src);
+}
+
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+                                     const ptrdiff_t *const offsets,
+                                     uint8x16_t *const level) {
+  level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+                                      const int stride,
+                                      const ptrdiff_t *const offsets,
+                                      uint8x16_t *const level) {
+  level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
+  level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
+  level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
+  level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
+  level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+  const uint8x16_t const_3 = vdupq_n_u8(3);
+  const uint8x16_t const_4 = vdupq_n_u8(4);
+  uint8x16_t count;
+
+  count = vminq_u8(level[0], const_3);
+  level[1] = vminq_u8(level[1], const_3);
+  level[2] = vminq_u8(level[2], const_3);
+  level[3] = vminq_u8(level[3], const_3);
+  level[4] = vminq_u8(level[4], const_3);
+  count = vaddq_u8(count, level[1]);
+  count = vaddq_u8(count, level[2]);
+  count = vaddq_u8(count, level[3]);
+  count = vaddq_u8(count, level[4]);
+
+  count = vrshrq_n_u8(count, 1);
+  count = vminq_u8(count, const_4);
+  return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *const coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
+
+  uint8x16_t pos_to_offset =
+      vld1q_u8((height == 4) ? c_4_po_2d[0] : c_4_po_2d[1]);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8_t *cc = coeff_contexts;
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(cc, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    cc += 16;
+    row -= 4;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset =
+      vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+                                             const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = 4 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vld1q_u8(c_4_po_ver);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 4));
+
+  int row = height;
+  do {
+    load_levels_4x4x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 4 * stride;
+    coeff_contexts += 16;
+    row -= 4;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+                                           const int height,
+                                           const ptrdiff_t *const offsets,
+                                           uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  uint8x16_t count;
+  uint8x16_t level[5];
+  uint8x16_t pos_to_offset[3];
+
+  assert(!(height % 2));
+
+  if (height == 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
+  } else if (height < 8) {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
+  } else {
+    pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
+  }
+  pos_to_offset[2] = vdupq_n_u8(21);
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset[0]);
+    vst1q_u8(cc, count);
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += 2 * stride;
+    cc += 16;
+    row -= 2;
+  } while (row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_hor);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+                                            const int height,
+                                            const ptrdiff_t *const offsets,
+                                            uint8_t *coeff_contexts) {
+  const int stride = 8 + TX_PAD_HOR;
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
+                                         vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(height % 2));
+
+  int row = height;
+  do {
+    load_levels_8x2x5(levels, stride, offsets, level);
+    count = get_coeff_contexts_kernel(level);
+    count = vaddq_u8(count, pos_to_offset);
+    vst1q_u8(coeff_contexts, count);
+    pos_to_offset = pos_to_offset_large;
+    levels += 2 * stride;
+    coeff_contexts += 16;
+    row -= 2;
+  } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+                                             const int real_width,
+                                             const int real_height,
+                                             const int width, const int height,
+                                             const ptrdiff_t *const offsets,
+                                             uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+  uint8_t *cc = coeff_contexts;
+  int row = height;
+  uint8x16_t pos_to_offset[5];
+  uint8x16_t pos_to_offset_large[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset_large[2] = vdupq_n_u8(21);
+  if (real_width == real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+    pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+        pos_to_offset_large[2];
+  } else if (real_width > real_height) {
+    pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+    pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+    pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+        vld1q_u8(c_16_po_2d_g[2]);
+    pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+  } else {  // real_width < real_height
+    pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+    pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+    pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+    pos_to_offset[4] = pos_to_offset_large[2];
+    pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(11);
+  }
+
+  do {
+    int w = width;
+
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(cc, count);
+      levels += 16;
+      cc += 16;
+      w -= 16;
+      pos_to_offset[0] = pos_to_offset_large[0];
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    pos_to_offset[2] = pos_to_offset[3];
+    pos_to_offset[3] = pos_to_offset[4];
+    pos_to_offset_large[0] = pos_to_offset_large[1];
+    pos_to_offset_large[1] = pos_to_offset_large[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+
+  coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  int row = height;
+  do {
+    uint8x16_t pos_to_offset = vld1q_u8(c_16_po_hor);
+
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset);
+      vst1q_u8(coeff_contexts, count);
+      pos_to_offset = pos_to_offset_large;
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+                                              const int width, const int height,
+                                              const ptrdiff_t *const offsets,
+                                              uint8_t *coeff_contexts) {
+  const int stride = width + TX_PAD_HOR;
+
+  uint8x16_t pos_to_offset[3];
+  uint8x16_t count;
+  uint8x16_t level[5];
+
+  assert(!(width % 16));
+
+  pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+  pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+  pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+  int row = height;
+  do {
+    int w = width;
+    do {
+      load_levels_16x1x5(levels, stride, offsets, level);
+      count = get_coeff_contexts_kernel(level);
+      count = vaddq_u8(count, pos_to_offset[0]);
+      vst1q_u8(coeff_contexts, count);
+      levels += 16;
+      coeff_contexts += 16;
+      w -= 16;
+    } while (w);
+
+    pos_to_offset[0] = pos_to_offset[1];
+    pos_to_offset[1] = pos_to_offset[2];
+    levels += TX_PAD_HOR;
+  } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+                                  const int16_t *const scan, const uint16_t eob,
+                                  const TX_SIZE tx_size,
+                                  const TX_CLASS tx_class,
+                                  int8_t *const coeff_contexts) {
+  const int last_idx = eob - 1;
+  if (!last_idx) {
+    coeff_contexts[0] = 0;
+    return;
+  }
+
+  uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+  const int real_width = tx_size_wide[tx_size];
+  const int real_height = tx_size_high[tx_size];
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int stride = width + TX_PAD_HOR;
+  ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16 byte aligned. */
+  assert(!((intptr_t)coeff_contexts & 0xf));
+
+  if (tx_class == TX_CLASS_2D) {
+    offsets[0] = 0 * stride + 2;
+    offsets[1] = 1 * stride + 1;
+    offsets[2] = 2 * stride + 0;
+
+    if (width == 4) {
+      get_4_nz_map_contexts_2d(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_2d(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coefficients);
+    }
+  } else if (tx_class == TX_CLASS_HORIZ) {
+    offsets[0] = 2;
+    offsets[1] = 3;
+    offsets[2] = 4;
+    if (width == 4) {
+      get_4_nz_map_contexts_hor(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_hor(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+    }
+  } else {  // TX_CLASS_VERT
+    offsets[0] = 2 * stride;
+    offsets[1] = 3 * stride;
+    offsets[2] = 4 * stride;
+    if (width == 4) {
+      get_4_nz_map_contexts_ver(levels, height, offsets, coefficients);
+    } else if (width == 8) {
+      get_8_coeff_contexts_ver(levels, height, offsets, coefficients);
+    } else {
+      get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+    }
+  }
+
+  const int bwl = get_txb_bwl(tx_size);
+  const int pos = scan[last_idx];
+  if (last_idx <= (height << bwl) / 8)
+    coeff_contexts[pos] = 1;
+  else if (last_idx <= (height << bwl) / 4)
+    coeff_contexts[pos] = 2;
+  else
+    coeff_contexts[pos] = 3;
+}
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 1dc3e1f..042d157 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -100,6 +100,7 @@
   void SpeedTestGetNzMapContextsRun() {
     const int kNumTests = 2000000000;
     aom_usec_timer timer;
+    aom_usec_timer timer_ref;
 
     printf("Note: Only test the largest possible eob case!\n");
     for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
@@ -117,6 +118,16 @@
       levels_ = set_levels(levels_buf_, width);
       InitDataWithEob(scan, bwl, eob);
 
+      aom_usec_timer_start(&timer_ref);
+      for (int i = 0; i < numTests; ++i) {
+        av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+                                  tx_class, coeff_contexts_ref_);
+      }
+      aom_usec_timer_mark(&timer_ref);
+
+      levels_ = set_levels(levels_buf_, width);
+      InitDataWithEob(scan, bwl, eob);
+
       aom_usec_timer_start(&timer);
       for (int i = 0; i < numTests; ++i) {
         get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
@@ -124,9 +135,14 @@
       }
       aom_usec_timer_mark(&timer);
 
+      const int elapsed_time_ref =
+          static_cast<int>(aom_usec_timer_elapsed(&timer_ref));
       const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
-             elapsed_time / 1000.0);
+
+      printf("get_nz_map_contexts_%2dx%2d: %7.1f ms ref %7.1f ms gain %4.2f\n",
+             real_width, real_height, elapsed_time / 1000.0,
+             elapsed_time_ref / 1000.0,
+             (elapsed_time_ref * 1.0) / (elapsed_time * 1.0));
     }
   }
 
@@ -182,6 +198,11 @@
                          ::testing::Values(av1_get_nz_map_contexts_sse2));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, EncodeTxbTest,
+                         ::testing::Values(av1_get_nz_map_contexts_neon));
+#endif
+
 typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
                                          const int width, const int height,
                                          uint8_t *const levels);