RTC: Let compiler unroll loops in get_txb_ctx
Performance:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR | SSIM | ENC_T |
|---------|----------|----------|----------|---------|-------|
| 7 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 7 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
| 8 | rtc | +0.000% | +0.000% | +0.000% | -0.1% |
| 8 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.1% |
|---------|----------|----------|----------|---------|-------|
| 9 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 9 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
|---------|----------|----------|----------|---------|-------|
| 10 | rtc | +0.000% | +0.000% | +0.000% | -0.2% |
| 10 | rtc_derf | +0.000% | +0.000% | +0.000% | -0.2% |
Change-Id: Iacdd6c69e72296cf73ed13770fc59934301974fd
diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h
index 5ba3951..c7be4a1 100644
--- a/av1/common/txb_common.h
+++ b/av1/common/txb_common.h
@@ -351,11 +351,11 @@
*cul_level += 2 << COEFF_CONTEXT_BITS;
}
-static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
- const TX_SIZE tx_size, const int plane,
- const ENTROPY_CONTEXT *const a,
- const ENTROPY_CONTEXT *const l,
- TXB_CTX *const txb_ctx) {
+static void get_txb_ctx_general(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a,
+ const ENTROPY_CONTEXT *const l,
+ TXB_CTX *const txb_ctx) {
#define MAX_TX_SIZE_UNIT 16
static const int8_t signs[3] = { 0, -1, 1 };
static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
@@ -437,7 +437,100 @@
: 7;
txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
}
-#undef MAX_TX_SIZE_UNIT
}
+#define SPECIALIZE_GET_TXB_CTX(w, h) \
+ static void get_txb_ctx_##w##x##h( \
+ const BLOCK_SIZE plane_bsize, const int plane, \
+ const ENTROPY_CONTEXT *const a, const ENTROPY_CONTEXT *const l, \
+ TXB_CTX *const txb_ctx) { \
+ static const int8_t signs[3] = { 0, -1, 1 }; \
+ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, \
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 \
+ }; \
+ const TX_SIZE tx_size = TX_##w##X##h; \
+ const int txb_w_unit = tx_size_wide_unit[tx_size]; \
+ const int txb_h_unit = tx_size_high_unit[tx_size]; \
+ int dc_sign = 0; \
+ int k = 0; \
+ \
+ do { \
+ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_w_unit); \
+ \
+ k = 0; \
+ do { \
+ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; \
+ assert(sign <= 2); \
+ dc_sign += signs[sign]; \
+ } while (++k < txb_h_unit); \
+ \
+ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; \
+ \
+ if (plane == 0) { \
+ if (plane_bsize == txsize_to_bsize[tx_size]) { \
+ txb_ctx->txb_skip_ctx = 0; \
+ } else { \
+ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 2, 4, 4, 4, 5 }, \
+ { 3, 5, 5, 5, 6 } }; \
+ int top = 0; \
+ int left = 0; \
+ \
+ k = 0; \
+ do { \
+ top |= a[k]; \
+ } while (++k < txb_w_unit); \
+ top &= COEFF_CONTEXT_MASK; \
+ top = AOMMIN(top, 4); \
+ \
+ k = 0; \
+ do { \
+ left |= l[k]; \
+ } while (++k < txb_h_unit); \
+ left &= COEFF_CONTEXT_MASK; \
+ left = AOMMIN(left, 4); \
+ \
+ txb_ctx->txb_skip_ctx = skip_contexts[top][left]; \
+ } \
+ } else { \
+ const int ctx_base = get_entropy_context(tx_size, a, l); \
+ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] > \
+ num_pels_log2_lookup[txsize_to_bsize[tx_size]]) \
+ ? 10 \
+ : 7; \
+ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset; \
+ } \
+ }
+
+SPECIALIZE_GET_TXB_CTX(4, 4)
+SPECIALIZE_GET_TXB_CTX(8, 8)
+SPECIALIZE_GET_TXB_CTX(16, 16)
+SPECIALIZE_GET_TXB_CTX(32, 32)
+
+// Wrapper for get_txb_ctx that calls the specialized version of get_txb_ctx_*
+// so that the compiler can compile away the while loops.
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a,
+ const ENTROPY_CONTEXT *const l,
+ TXB_CTX *const txb_ctx) {
+ switch (tx_size) {
+ case TX_4X4: get_txb_ctx_4x4(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_8X8: get_txb_ctx_8x8(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_16X16: get_txb_ctx_16x16(plane_bsize, plane, a, l, txb_ctx); break;
+ case TX_32X32: get_txb_ctx_32x32(plane_bsize, plane, a, l, txb_ctx); break;
+ default:
+ get_txb_ctx_general(plane_bsize, tx_size, plane, a, l, txb_ctx);
+ break;
+ }
+}
+#undef MAX_TX_SIZE_UNIT
+
#endif // AOM_AV1_COMMON_TXB_COMMON_H_