Add av1_get_nz_map_contexts_sse2()
10x - 50x faster than C code.
av1_cost_coeffs_txb() is about 6% faster.
av1_cost_coeffs() is about 3% faster.
Change-Id: Ib9cbed02a65b9cb0c5deb7a5d99c95d0d8ba32c0
diff --git a/aom_dsp/x86/mem_sse2.h b/aom_dsp/x86/mem_sse2.h
index 54f16dd..10a505f 100644
--- a/aom_dsp/x86/mem_sse2.h
+++ b/aom_dsp/x86/mem_sse2.h
@@ -22,4 +22,20 @@
_mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
}
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ __m128i dst;
+ dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+ dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+ return dst;
+}
+
#endif // AOM_DSP_X86_MEM_SSE2_H_
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 8823b42..2a7a1b9 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -400,6 +400,10 @@
${AOM_AV1_ENCODER_SOURCES}
"${AOM_ROOT}/av1/encoder/encodetxb.c"
"${AOM_ROOT}/av1/encoder/encodetxb.h")
+
+ set(AOM_AV1_ENCODER_INTRIN_SSE2
+ ${AOM_AV1_ENCODER_INTRIN_SSE2}
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c")
endif ()
if (CONFIG_CFL)
diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk
index d35bb89..0bfeea5 100644
--- a/av1/av1_cx.mk
+++ b/av1/av1_cx.mk
@@ -123,6 +123,10 @@
endif
AV1_CX_SRCS-yes += encoder/pickcdef.c
+ifeq ($(CONFIG_LV_MAP),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE2) += encoder/x86/encodetxb_sse2.c
+endif
+
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/av1_quantize_sse2.c
AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/av1_quantize_avx2.c
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index d566d14..c3c3613 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -486,6 +486,12 @@
# End av1_high encoder functions
+ # txb
+ if (aom_config("CONFIG_LV_MAP") eq "yes") {
+ add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_TYPE tx_type, int8_t *const coeff_contexts";
+ specialize qw/av1_get_nz_map_contexts sse2/;
+ }
+
add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
specialize qw/av1_wedge_sse_from_residuals sse2/;
add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 910a90f..b687e60 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -314,11 +314,10 @@
}
}
-static void av1_get_nz_map_contexts(const uint8_t *const levels,
- const int16_t *const scan,
- const uint16_t eob, const TX_SIZE tx_size,
- const TX_TYPE tx_type,
- int8_t *const coeff_contexts) {
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ int8_t *const coeff_contexts) {
const int bwl = get_txb_bwl(tx_size);
#if CONFIG_LV_MAP_MULTI
const int height = get_txb_high(tx_size);
diff --git a/av1/encoder/x86/encodetxb_sse2.c b/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000..708e63f
--- /dev/null
+++ b/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
+ count = _mm_avg_epu8(count, _mm_setzero_si128());
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (height == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ row -= 4;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(height % 2));
+
+ if (height == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (height < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ row -= 2;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width > real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width < real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+ }
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ w -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ int8_t *const coeff_contexts) {
+#if CONFIG_LV_MAP_MULTI
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = SIG_COEF_CONTEXTS - 4;
+ return;
+ }
+#else
+ (void)scan;
+ (void)eob;
+#endif
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = width + TX_PAD_HOR;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (width == 4) {
+ get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 16) {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (width == 4) {
+ get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (width == 4) {
+ get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+#if CONFIG_LV_MAP_MULTI
+ const int bwl = get_txb_bwl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (height << bwl) / 8)
+ coeff_contexts[pos] = SIG_COEF_CONTEXTS - 3;
+ else if (last_idx <= (height << bwl) / 4)
+ coeff_contexts[pos] = SIG_COEF_CONTEXTS - 2;
+ else
+ coeff_contexts[pos] = SIG_COEF_CONTEXTS - 1;
+#endif
+}
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
new file mode 100644
index 0000000..dfb2a63
--- /dev/null
+++ b/test/encodetxb_test.cc
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "av1/decoder/decodeframe.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels,
+ const int16_t *const scan,
+ const uint16_t eob, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ int8_t *const coeff_contexts);
+
+class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
+ public:
+ EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {}
+
+ virtual ~EncodeTxbTest() {}
+
+ virtual void SetUp() {
+ coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
+ aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
+ ASSERT_TRUE(coeff_contexts_ref_ != NULL);
+ coeff_contexts_ = reinterpret_cast<int8_t *>(
+ aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE));
+ ASSERT_TRUE(coeff_contexts_ != NULL);
+ }
+
+ virtual void TearDown() {
+ aom_free(coeff_contexts_ref_);
+ aom_free(coeff_contexts_);
+ libaom_test::ClearSystemState();
+ }
+
+ void GetNzMapContextsRun() {
+ const int kNumTests = 10;
+ int result = 0;
+
+ for (int is_inter = 0; is_inter < 2; ++is_inter) {
+ for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+ const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+ const int width = get_txb_wide((TX_SIZE)tx_size);
+ const int height = get_txb_high((TX_SIZE)tx_size);
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int16_t *const scan =
+ is_inter ? av1_inter_scan_orders[tx_size][tx_type].scan
+ : av1_intra_scan_orders[tx_size][tx_type].scan;
+
+ levels_ = set_levels(levels_buf_, width);
+ for (int i = 0; i < kNumTests && !result; ++i) {
+ for (int eob = 1; eob <= width * height && !result; ++eob) {
+ InitDataWithEob(scan, bwl, eob);
+
+ av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+ (TX_TYPE)tx_type, coeff_contexts_ref_);
+ get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+ (TX_TYPE)tx_type, coeff_contexts_);
+
+ result = Compare(scan, eob);
+
+ EXPECT_EQ(result, 0)
+ << " tx_class " << tx_type_to_class[tx_type] << " width "
+ << real_width << " height " << real_height << " eob " << eob;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void SpeedTestGetNzMapContextsRun() {
+ const int kNumTests = 2000000000;
+ aom_usec_timer timer;
+
+ printf("Note: Only test the largest possible eob case!\n");
+ for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+ const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+ const int width = get_txb_wide((TX_SIZE)tx_size);
+ const int height = get_txb_high((TX_SIZE)tx_size);
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const TX_TYPE tx_type = DCT_DCT;
+ const int16_t *const scan = av1_inter_scan_orders[tx_size][tx_type].scan;
+ const int eob = width * height;
+ const int numTests = kNumTests / (width * height);
+
+ levels_ = set_levels(levels_buf_, width);
+ InitDataWithEob(scan, bwl, eob);
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < numTests; ++i) {
+ get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size, tx_type,
+ coeff_contexts_);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
+ elapsed_time / 1000.0);
+ }
+ }
+
+ private:
+ void InitDataWithEob(const int16_t *const scan, const int bwl,
+ const int eob) {
+ memset(levels_buf_, 0, sizeof(levels_buf_));
+ memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+
+ for (int c = 0; c < eob; ++c) {
+ levels_[get_padded_idx(scan[c], bwl)] =
+ static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX));
+ coeff_contexts_[scan[c]] = rnd_.Rand16() >> 1;
+ }
+
+ memcpy(coeff_contexts_ref_, coeff_contexts_,
+ sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+ }
+
+ bool Compare(const int16_t *const scan, const int eob) const {
+ bool result = false;
+ if (memcmp(coeff_contexts_, coeff_contexts_ref_,
+ sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) {
+ for (int i = 0; i < eob; i++) {
+ const int pos = scan[i];
+ if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) {
+ printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos,
+ coeff_contexts_ref_[pos], coeff_contexts_[pos]);
+ result = true;
+ break;
+ }
+ }
+ }
+ return result;
+ }
+
+ GetNzMapContextsFunc get_nz_map_contexts_func_;
+ ACMRandom rnd_;
+ uint8_t levels_buf_[TX_PAD_2D];
+ uint8_t *levels_;
+ int8_t *coeff_contexts_ref_;
+ int8_t *coeff_contexts_;
+};
+
+TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
+
+TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) {
+ SpeedTestGetNzMapContextsRun();
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, EncodeTxbTest,
+ ::testing::Values(av1_get_nz_map_contexts_sse2));
+#endif
+} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 71ffad9..4ff69c5 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -164,6 +164,10 @@
set(AOM_UNIT_TEST_COMMON_SOURCES
${AOM_UNIT_TEST_COMMON_SOURCES}
"${AOM_ROOT}/test/txb_test.cc")
+
+ set(AOM_UNIT_TEST_ENCODER_SOURCES
+ ${AOM_UNIT_TEST_ENCODER_SOURCES}
+ "${AOM_ROOT}/test/encodetxb_test.cc")
endif ()
set(AOM_UNIT_TEST_COMMON_INTRIN_NEON
diff --git a/test/test.mk b/test/test.mk
index 0df7d0c..ed38f24 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -203,6 +203,7 @@
ifeq ($(CONFIG_AV1_ENCODER),yes)
LIBAOM_TEST_SRCS-yes += avg_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_LV_MAP) += encodetxb_test.cc
endif
ifeq ($(CONFIG_INTERNAL_STATS),yes)
LIBAOM_TEST_SRCS-$(CONFIG_HIGHBITDEPTH) += hbd_metrics_test.cc