Modify encoder's entropy coding byte-push mechanism
Changed the byte-push mechanism in libaom entropy
encoder, as suggested in a To Do comment. In the
parent version, the register is flushed when at
least 1 byte is available. In the CL, we make the
register 64-bit and flush it when at least 4 bytes
are accumulated in it.
An intermediate 16-bit buffer 'precarry_buf' is
removed and carry-propagation mechanism is changed.
No changes to the decoder logic at this time, it
already has an OPT comment about using uint64_t
for decode-time optimization.
AV1 RTC Encoder:
cpu Enc time
red (%)
5 0.407
6 0.406
7 0.612
8 0.964
9 1.278
10 1.654
AVIF Encoder:
cpu Enc time
red (%)
6 0.238
7 0.302
8 0.664
9 1.929
This is a bit-exact change.
Change-Id: If23e711145aef1fa193523c98b29c7c82eff76b7
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c
index 2fd4493..6f90480 100644
--- a/aom_dsp/entenc.c
+++ b/aom_dsp/entenc.c
@@ -49,11 +49,11 @@
}*/
/*Takes updated low and range values, renormalizes them so that
- 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
+ 32768 <= rng < 65536 (flushing bytes from low to the output buffer if
necessary), and stores them back in the encoder context.
low: The new value of low.
rng: The new value of the range.*/
-static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low,
unsigned rng) {
int d;
int c;
@@ -63,44 +63,59 @@
/*The number of leading zeros in the 16-bit binary representation of rng.*/
d = 16 - OD_ILOG_NZ(rng);
s = c + d;
- /*TODO: Right now we flush every time we have at least one byte available.
- Instead we should use an od_ec_window and flush right before we're about to
- shift bits off the end of the window.
- For a 32-bit window this is about the same amount of work, but for a 64-bit
- window it should be a fair win.*/
- if (s >= 0) {
- uint16_t *buf;
- uint32_t storage;
- uint32_t offs;
- unsigned m;
- buf = enc->precarry_buf;
- storage = enc->precarry_storage;
- offs = enc->offs;
- if (offs + 2 > storage) {
- storage = 2 * storage + 2;
- buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
- if (buf == NULL) {
+
+ /* We flush every time "low" cannot safely and efficiently accommodate any
+ more data. Overall, c must not exceed 63 at the time of byte flush out. To
+ facilitate this, "s" cannot exceed 56-bits because we have to keep 1 byte
+ for carry. Also, we need to subtract 16 because we want to keep room for
+ the next symbol worth "d"-bits (max 15). An alternate condition would be if
+ (e < d), where e = number of leading zeros in "low", indicating there is
+ not enough rooom to accommodate "rng" worth of "d"-bits in "low". However,
+ this approach needs additional computations: (i) compute "e", (ii) push
+ the leading 0x00's as a special case.
+ */
+ if (s >= 40) { // 56 - 16
+ unsigned char *out = enc->buf;
+ uint32_t storage = enc->storage;
+ uint32_t offs = enc->offs;
+ if (offs + 8 > storage) {
+ storage = 2 * storage + 8;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
enc->error = -1;
enc->offs = 0;
return;
}
- enc->precarry_buf = buf;
- enc->precarry_storage = storage;
+ enc->buf = out;
+ enc->storage = storage;
}
- c += 16;
- m = (1 << c) - 1;
- if (s >= 8) {
- assert(offs < storage);
- buf[offs++] = (uint16_t)(low >> c);
- low &= m;
- c -= 8;
- m >>= 8;
- }
- assert(offs < storage);
- buf[offs++] = (uint16_t)(low >> c);
+ // Need to add 1 byte here since enc->cnt always counts 1 byte less
+ // (enc->cnt = -9) to ensure correct operation
+ uint8_t num_bytes_ready = (s >> 3) + 1;
+
+ // Update "c" to contain the number of non-ready bits in "low". Since "low"
+ // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take
+ // off the number of ready bits.
+ c += 24 - (num_bytes_ready << 3);
+
+ // Prepare "output" and update "low"
+ uint64_t output = low >> c;
+ low = low & (((uint64_t)1 << c) - 1);
+
+ // Prepare data and carry mask
+ uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3);
+ uint64_t carry = output & mask;
+
+ mask = mask - 0x01;
+ output = output & mask;
+
+ // Write data in a single operation
+ write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs,
+ num_bytes_ready);
+
+ // Update state of the encoder: enc->cnt to contain the number of residual
+ // bits
s = c + d - 24;
- low &= m;
- enc->offs = offs;
}
enc->low = low << d;
enc->rng = rng << d;
@@ -117,12 +132,6 @@
enc->storage = 0;
enc->error = -1;
}
- enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
- enc->precarry_storage = size;
- if (size > 0 && enc->precarry_buf == NULL) {
- enc->precarry_storage = 0;
- enc->error = -1;
- }
}
/*Reinitializes the encoder.*/
@@ -141,21 +150,16 @@
}
/*Frees the buffers used by the encoder.*/
-void od_ec_enc_clear(od_ec_enc *enc) {
- free(enc->precarry_buf);
- free(enc->buf);
-}
+void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); }
/*Encodes a symbol given its frequency in Q15.
fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
- before the
- one to be encoded.
+ before the one to be encoded.
fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
- including
- the one to be encoded.*/
+ including the one to be encoded.*/
static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
int nsyms) {
- od_ec_window l;
+ od_ec_enc_window l;
unsigned r;
unsigned u;
unsigned v;
@@ -191,7 +195,7 @@
val: The value to encode (0 or 1).
f: The probability that the val is one, scaled by 32768.*/
void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
- od_ec_window l;
+ od_ec_enc_window l;
unsigned r;
unsigned v;
assert(0 < f);
@@ -251,12 +255,11 @@
mask = ((1U << nbits) - 1) << shift;
if (enc->offs > 0) {
/*The first byte has been finalized.*/
- enc->precarry_buf[0] =
- (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
+ enc->buf[0] = (unsigned char)((enc->buf[0] & ~mask) | val << shift);
} else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
/*The first byte has yet to be output.*/
- enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
- (od_ec_window)val << (16 + enc->cnt + shift);
+ enc->low = (enc->low & ~((od_ec_enc_window)mask << (16 + enc->cnt))) |
+ (od_ec_enc_window)val << (16 + enc->cnt + shift);
} else {
/*The encoder hasn't even encoded _nbits of data yet.*/
enc->error = -1;
@@ -276,11 +279,10 @@
unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
unsigned char *out;
uint32_t storage;
- uint16_t *buf;
uint32_t offs;
- od_ec_window m;
- od_ec_window e;
- od_ec_window l;
+ od_ec_enc_window m;
+ od_ec_enc_window e;
+ od_ec_enc_window l;
int c;
int s;
if (enc->error) return NULL;
@@ -295,8 +297,7 @@
(double)tell / enc->nb_symbols);
}
#endif
- /*We output the minimum number of bits that ensures that the symbols encoded
- thus far will be decoded correctly regardless of the bits that follow.*/
+
l = enc->low;
c = enc->cnt;
s = 10;
@@ -304,36 +305,14 @@
e = ((l + m) & ~m) | (m + 1);
s += c;
offs = enc->offs;
- buf = enc->precarry_buf;
- if (s > 0) {
- unsigned n;
- storage = enc->precarry_storage;
- if (offs + ((s + 7) >> 3) > storage) {
- storage = storage * 2 + ((s + 7) >> 3);
- buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
- if (buf == NULL) {
- enc->error = -1;
- return NULL;
- }
- enc->precarry_buf = buf;
- enc->precarry_storage = storage;
- }
- n = (1 << (c + 16)) - 1;
- do {
- assert(offs < storage);
- buf[offs++] = (uint16_t)(e >> (c + 16));
- e &= n;
- s -= 8;
- c -= 8;
- n >>= 8;
- } while (s > 0);
- }
+
/*Make sure there's enough room for the entropy-coded bits.*/
out = enc->buf;
storage = enc->storage;
- c = OD_MAXI((s + 7) >> 3, 0);
- if (offs + c > storage) {
- storage = offs + c;
+ const int s_bits = (s + 7) >> 3;
+ int b = OD_MAXI(s_bits, 0);
+ if (offs + b > storage) {
+ storage = offs + b;
out = (unsigned char *)realloc(out, sizeof(*out) * storage);
if (out == NULL) {
enc->error = -1;
@@ -342,23 +321,30 @@
enc->buf = out;
enc->storage = storage;
}
- *nbytes = offs;
- /*Perform carry propagation.*/
- assert(offs <= storage);
- out = out + storage - offs;
- c = 0;
- while (offs > 0) {
- offs--;
- c = buf[offs] + c;
- out[offs] = (unsigned char)c;
- c >>= 8;
+
+ /*We output the minimum number of bits that ensures that the symbols encoded
+ thus far will be decoded correctly regardless of the bits that follow.*/
+ if (s > 0) {
+ uint64_t n;
+ n = ((uint64_t)1 << (c + 16)) - 1;
+ do {
+ assert(offs < storage);
+ uint16_t val = (uint16_t)(e >> (c + 16));
+ out[offs] = (unsigned char)(val & 0x00FF);
+ if (val & 0x0100) {
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ offs++;
+
+ e &= n;
+ s -= 8;
+ c -= 8;
+ n >>= 8;
+ } while (s > 0);
}
- /*Note: Unless there's an allocation error, if you keep encoding into the
- current buffer and call this function again later, everything will work
- just fine (you won't get a new packet out, but you will get a single
- buffer with the new data appended to the old).
- However, this function is O(N) where N is the amount of data coded so far,
- so calling it more than once for a given packet is a bad idea.*/
+ *nbytes = offs;
+
return out;
}
@@ -407,17 +393,10 @@
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
unsigned char *buf;
uint32_t storage;
- uint16_t *precarry_buf;
- uint32_t precarry_storage;
assert(dst->storage >= src->storage);
- assert(dst->precarry_storage >= src->precarry_storage);
buf = dst->buf;
storage = dst->storage;
- precarry_buf = dst->precarry_buf;
- precarry_storage = dst->precarry_storage;
OD_COPY(dst, src, 1);
dst->buf = buf;
dst->storage = storage;
- dst->precarry_buf = precarry_buf;
- dst->precarry_storage = precarry_storage;
}
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 3551d42..467e47b 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -13,11 +13,14 @@
#define AOM_AOM_DSP_ENTENC_H_
#include <stddef.h>
#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
#ifdef __cplusplus
extern "C" {
#endif
+typedef uint64_t od_ec_enc_window;
+
typedef struct od_ec_enc od_ec_enc;
#define OD_MEASURE_EC_OVERHEAD (0)
@@ -30,14 +33,10 @@
unsigned char *buf;
/*The size of the buffer.*/
uint32_t storage;
- /*A buffer for output bytes with their associated carry flags.*/
- uint16_t *precarry_buf;
- /*The size of the pre-carry buffer.*/
- uint32_t precarry_storage;
/*The offset at which the next entropy-coded byte will be written.*/
uint32_t offs;
/*The low end of the current range.*/
- od_ec_window low;
+ od_ec_enc_window low;
/*The number of values in the current range.*/
uint16_t rng;
/*The number of bits of data in the current value.*/
@@ -78,6 +77,32 @@
void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
+// buf is the frame bitbuffer, offs is where carry to be added
+static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) {
+ uint16_t sum, carry = 1;
+ do {
+ sum = (uint16_t)buf[offs] + 1;
+ buf[offs--] = (unsigned char)sum;
+ carry = sum >> 8;
+ } while (carry);
+}
+
+// Reverse byte order and write data to buffer adding the carry-bit
+static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
+ uint32_t offs, uint64_t output,
+ uint64_t carry,
+ uint32_t *enc_offs,
+ uint8_t num_bytes_ready) {
+ const uint64_t reg = get_byteswap64(output) >> ((8 - num_bytes_ready) << 3);
+ memcpy(&out[offs], ®, 8);
+ // Propagate carry backwards if exists
+ if (carry) {
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ *enc_offs = offs + num_bytes_ready;
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 44df173..9309dad 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -13,6 +13,7 @@
#define AOM_AOM_PORTS_BITOPS_H_
#include <assert.h>
+#include <stdint.h>
#include "aom_ports/msvc.h"
#include "config/aom_config.h"
@@ -32,7 +33,13 @@
// Returns (int)floor(log2(n)). n must be > 0.
// These versions of get_msb() are only valid when n != 0 because all
// of the optimized versions are undefined when n == 0:
-// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// get_byteswap64:
+// Returns the number (uint64_t) with byte-positions reversed
+// e.g. input 0x123456789ABCDEF0 returns 0xF0DEBC9A78563412
+
+// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+// MSVC: https://learn.microsoft.com/en-us/cpp/c-runtime-library/
// use GNU builtins where available.
#if defined(__GNUC__) && \
@@ -41,6 +48,10 @@
assert(n != 0);
return 31 ^ __builtin_clz(n);
}
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+ return __builtin_bswap64(num);
+}
#elif defined(USE_MSC_INTRINSICS)
#pragma intrinsic(_BitScanReverse)
@@ -50,6 +61,10 @@
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+ return _byteswap_uint64(num);
+}
#undef USE_MSC_INTRINSICS
#else
static INLINE int get_msb(unsigned int n) {
@@ -69,6 +84,26 @@
}
return log;
}
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+ uint64_t out = 0x00;
+ uint64_t mask = 0xFF00000000000000;
+ int bit_shift = 56; // 7 bytes
+ // 4 ms bytes
+ do {
+ out |= (num & mask) >> bit_shift;
+ mask >>= 8;
+ bit_shift -= 16;
+ } while (bit_shift >= 0);
+ // 4 ls bytes
+ bit_shift = 8; // 1 byte
+ do {
+ out |= (num & mask) << bit_shift;
+ mask >>= 8;
+ bit_shift += 16;
+ } while (bit_shift <= 56);
+ return out;
+}
#endif
#ifdef __cplusplus