Modify encoder's entropy coding byte-push mechanism

Changed the byte-push mechanism in libaom entropy
encoder, as suggested in a To Do comment. In the
parent version, the register is flushed when at
least 1 byte is available. In the CL, we make the
register 64-bit and flush it when at least 4 bytes
are accumulated in it.

An intermediate 16-bit buffer 'precarry_buf' is
removed and carry-propagation mechanism is changed.

No changes to the decoder logic at this time, it
already has an OPT comment about using uint64_t
for decode-time optimization.

AV1 RTC Encoder:

cpu   Enc time
       red (%)
 5      0.407
 6      0.406
 7      0.612
 8      0.964
 9      1.278
10      1.654

AVIF Encoder:

cpu   Enc time
       red (%)
 6      0.238
 7      0.302
 8      0.664
 9      1.929

This is a bit-exact change.

Change-Id: If23e711145aef1fa193523c98b29c7c82eff76b7
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c
index 2fd4493..6f90480 100644
--- a/aom_dsp/entenc.c
+++ b/aom_dsp/entenc.c
@@ -49,11 +49,11 @@
   }*/
 
 /*Takes updated low and range values, renormalizes them so that
-   32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
+   32768 <= rng < 65536 (flushing bytes from low to the output buffer if
    necessary), and stores them back in the encoder context.
   low: The new value of low.
   rng: The new value of the range.*/
-static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low,
                                 unsigned rng) {
   int d;
   int c;
@@ -63,44 +63,59 @@
   /*The number of leading zeros in the 16-bit binary representation of rng.*/
   d = 16 - OD_ILOG_NZ(rng);
   s = c + d;
-  /*TODO: Right now we flush every time we have at least one byte available.
-    Instead we should use an od_ec_window and flush right before we're about to
-     shift bits off the end of the window.
-    For a 32-bit window this is about the same amount of work, but for a 64-bit
-     window it should be a fair win.*/
-  if (s >= 0) {
-    uint16_t *buf;
-    uint32_t storage;
-    uint32_t offs;
-    unsigned m;
-    buf = enc->precarry_buf;
-    storage = enc->precarry_storage;
-    offs = enc->offs;
-    if (offs + 2 > storage) {
-      storage = 2 * storage + 2;
-      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
-      if (buf == NULL) {
+
+  /* We flush every time "low" cannot safely and efficiently accommodate any
+     more data. Overall, c must not exceed 63 at the time of byte flush out. To
+     facilitate this, "s" cannot exceed 56-bits because we have to keep 1 byte
+     for carry. Also, we need to subtract 16 because we want to keep room for
+     the next symbol worth "d"-bits (max 15). An alternate condition would be if
+     (e < d), where e = number of leading zeros in "low", indicating there is
+     not enough rooom to accommodate "rng" worth of "d"-bits in "low". However,
+     this approach needs additional computations: (i) compute "e", (ii) push
+     the leading 0x00's as a special case.
+  */
+  if (s >= 40) {  // 56 - 16
+    unsigned char *out = enc->buf;
+    uint32_t storage = enc->storage;
+    uint32_t offs = enc->offs;
+    if (offs + 8 > storage) {
+      storage = 2 * storage + 8;
+      out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+      if (out == NULL) {
         enc->error = -1;
         enc->offs = 0;
         return;
       }
-      enc->precarry_buf = buf;
-      enc->precarry_storage = storage;
+      enc->buf = out;
+      enc->storage = storage;
     }
-    c += 16;
-    m = (1 << c) - 1;
-    if (s >= 8) {
-      assert(offs < storage);
-      buf[offs++] = (uint16_t)(low >> c);
-      low &= m;
-      c -= 8;
-      m >>= 8;
-    }
-    assert(offs < storage);
-    buf[offs++] = (uint16_t)(low >> c);
+    // Need to add 1 byte here since enc->cnt always counts 1 byte less
+    // (enc->cnt = -9) to ensure correct operation
+    uint8_t num_bytes_ready = (s >> 3) + 1;
+
+    // Update "c" to contain the number of non-ready bits in "low". Since "low"
+    // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take
+    // off the number of ready bits.
+    c += 24 - (num_bytes_ready << 3);
+
+    // Prepare "output" and update "low"
+    uint64_t output = low >> c;
+    low = low & (((uint64_t)1 << c) - 1);
+
+    // Prepare data and carry mask
+    uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3);
+    uint64_t carry = output & mask;
+
+    mask = mask - 0x01;
+    output = output & mask;
+
+    // Write data in a single operation
+    write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs,
+                              num_bytes_ready);
+
+    // Update state of the encoder: enc->cnt to contain the number of residual
+    // bits
     s = c + d - 24;
-    low &= m;
-    enc->offs = offs;
   }
   enc->low = low << d;
   enc->rng = rng << d;
@@ -117,12 +132,6 @@
     enc->storage = 0;
     enc->error = -1;
   }
-  enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
-  enc->precarry_storage = size;
-  if (size > 0 && enc->precarry_buf == NULL) {
-    enc->precarry_storage = 0;
-    enc->error = -1;
-  }
 }
 
 /*Reinitializes the encoder.*/
@@ -141,21 +150,16 @@
 }
 
 /*Frees the buffers used by the encoder.*/
-void od_ec_enc_clear(od_ec_enc *enc) {
-  free(enc->precarry_buf);
-  free(enc->buf);
-}
+void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); }
 
 /*Encodes a symbol given its frequency in Q15.
   fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
-  before the
-       one to be encoded.
+  before the one to be encoded.
   fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
-  including
-       the one to be encoded.*/
+  including the one to be encoded.*/
 static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
                              int nsyms) {
-  od_ec_window l;
+  od_ec_enc_window l;
   unsigned r;
   unsigned u;
   unsigned v;
@@ -191,7 +195,7 @@
   val: The value to encode (0 or 1).
   f: The probability that the val is one, scaled by 32768.*/
 void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
-  od_ec_window l;
+  od_ec_enc_window l;
   unsigned r;
   unsigned v;
   assert(0 < f);
@@ -251,12 +255,11 @@
   mask = ((1U << nbits) - 1) << shift;
   if (enc->offs > 0) {
     /*The first byte has been finalized.*/
-    enc->precarry_buf[0] =
-        (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
+    enc->buf[0] = (unsigned char)((enc->buf[0] & ~mask) | val << shift);
   } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
     /*The first byte has yet to be output.*/
-    enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
-               (od_ec_window)val << (16 + enc->cnt + shift);
+    enc->low = (enc->low & ~((od_ec_enc_window)mask << (16 + enc->cnt))) |
+               (od_ec_enc_window)val << (16 + enc->cnt + shift);
   } else {
     /*The encoder hasn't even encoded _nbits of data yet.*/
     enc->error = -1;
@@ -276,11 +279,10 @@
 unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
   unsigned char *out;
   uint32_t storage;
-  uint16_t *buf;
   uint32_t offs;
-  od_ec_window m;
-  od_ec_window e;
-  od_ec_window l;
+  od_ec_enc_window m;
+  od_ec_enc_window e;
+  od_ec_enc_window l;
   int c;
   int s;
   if (enc->error) return NULL;
@@ -295,8 +297,7 @@
             (double)tell / enc->nb_symbols);
   }
 #endif
-  /*We output the minimum number of bits that ensures that the symbols encoded
-     thus far will be decoded correctly regardless of the bits that follow.*/
+
   l = enc->low;
   c = enc->cnt;
   s = 10;
@@ -304,36 +305,14 @@
   e = ((l + m) & ~m) | (m + 1);
   s += c;
   offs = enc->offs;
-  buf = enc->precarry_buf;
-  if (s > 0) {
-    unsigned n;
-    storage = enc->precarry_storage;
-    if (offs + ((s + 7) >> 3) > storage) {
-      storage = storage * 2 + ((s + 7) >> 3);
-      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
-      if (buf == NULL) {
-        enc->error = -1;
-        return NULL;
-      }
-      enc->precarry_buf = buf;
-      enc->precarry_storage = storage;
-    }
-    n = (1 << (c + 16)) - 1;
-    do {
-      assert(offs < storage);
-      buf[offs++] = (uint16_t)(e >> (c + 16));
-      e &= n;
-      s -= 8;
-      c -= 8;
-      n >>= 8;
-    } while (s > 0);
-  }
+
   /*Make sure there's enough room for the entropy-coded bits.*/
   out = enc->buf;
   storage = enc->storage;
-  c = OD_MAXI((s + 7) >> 3, 0);
-  if (offs + c > storage) {
-    storage = offs + c;
+  const int s_bits = (s + 7) >> 3;
+  int b = OD_MAXI(s_bits, 0);
+  if (offs + b > storage) {
+    storage = offs + b;
     out = (unsigned char *)realloc(out, sizeof(*out) * storage);
     if (out == NULL) {
       enc->error = -1;
@@ -342,23 +321,30 @@
     enc->buf = out;
     enc->storage = storage;
   }
-  *nbytes = offs;
-  /*Perform carry propagation.*/
-  assert(offs <= storage);
-  out = out + storage - offs;
-  c = 0;
-  while (offs > 0) {
-    offs--;
-    c = buf[offs] + c;
-    out[offs] = (unsigned char)c;
-    c >>= 8;
+
+  /*We output the minimum number of bits that ensures that the symbols encoded
+     thus far will be decoded correctly regardless of the bits that follow.*/
+  if (s > 0) {
+    uint64_t n;
+    n = ((uint64_t)1 << (c + 16)) - 1;
+    do {
+      assert(offs < storage);
+      uint16_t val = (uint16_t)(e >> (c + 16));
+      out[offs] = (unsigned char)(val & 0x00FF);
+      if (val & 0x0100) {
+        assert(offs > 0);
+        propagate_carry_bwd(out, offs - 1);
+      }
+      offs++;
+
+      e &= n;
+      s -= 8;
+      c -= 8;
+      n >>= 8;
+    } while (s > 0);
   }
-  /*Note: Unless there's an allocation error, if you keep encoding into the
-     current buffer and call this function again later, everything will work
-     just fine (you won't get a new packet out, but you will get a single
-     buffer with the new data appended to the old).
-    However, this function is O(N) where N is the amount of data coded so far,
-     so calling it more than once for a given packet is a bad idea.*/
+  *nbytes = offs;
+
   return out;
 }
 
@@ -407,17 +393,10 @@
 void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
   unsigned char *buf;
   uint32_t storage;
-  uint16_t *precarry_buf;
-  uint32_t precarry_storage;
   assert(dst->storage >= src->storage);
-  assert(dst->precarry_storage >= src->precarry_storage);
   buf = dst->buf;
   storage = dst->storage;
-  precarry_buf = dst->precarry_buf;
-  precarry_storage = dst->precarry_storage;
   OD_COPY(dst, src, 1);
   dst->buf = buf;
   dst->storage = storage;
-  dst->precarry_buf = precarry_buf;
-  dst->precarry_storage = precarry_storage;
 }
diff --git a/aom_dsp/entenc.h b/aom_dsp/entenc.h
index 3551d42..467e47b 100644
--- a/aom_dsp/entenc.h
+++ b/aom_dsp/entenc.h
@@ -13,11 +13,14 @@
 #define AOM_AOM_DSP_ENTENC_H_
 #include <stddef.h>
 #include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef uint64_t od_ec_enc_window;
+
 typedef struct od_ec_enc od_ec_enc;
 
 #define OD_MEASURE_EC_OVERHEAD (0)
@@ -30,14 +33,10 @@
   unsigned char *buf;
   /*The size of the buffer.*/
   uint32_t storage;
-  /*A buffer for output bytes with their associated carry flags.*/
-  uint16_t *precarry_buf;
-  /*The size of the pre-carry buffer.*/
-  uint32_t precarry_storage;
   /*The offset at which the next entropy-coded byte will be written.*/
   uint32_t offs;
   /*The low end of the current range.*/
-  od_ec_window low;
+  od_ec_enc_window low;
   /*The number of values in the current range.*/
   uint16_t rng;
   /*The number of bits of data in the current value.*/
@@ -78,6 +77,32 @@
 void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
 void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
 
+// buf is the frame bitbuffer, offs is where carry to be added
+static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) {
+  uint16_t sum, carry = 1;
+  do {
+    sum = (uint16_t)buf[offs] + 1;
+    buf[offs--] = (unsigned char)sum;
+    carry = sum >> 8;
+  } while (carry);
+}
+
+// Reverse byte order and write data to buffer adding the carry-bit
+static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
+                                                 uint32_t offs, uint64_t output,
+                                                 uint64_t carry,
+                                                 uint32_t *enc_offs,
+                                                 uint8_t num_bytes_ready) {
+  const uint64_t reg = get_byteswap64(output) >> ((8 - num_bytes_ready) << 3);
+  memcpy(&out[offs], &reg, 8);
+  // Propagate carry backwards if exists
+  if (carry) {
+    assert(offs > 0);
+    propagate_carry_bwd(out, offs - 1);
+  }
+  *enc_offs = offs + num_bytes_ready;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_ports/bitops.h b/aom_ports/bitops.h
index 44df173..9309dad 100644
--- a/aom_ports/bitops.h
+++ b/aom_ports/bitops.h
@@ -13,6 +13,7 @@
 #define AOM_AOM_PORTS_BITOPS_H_
 
 #include <assert.h>
+#include <stdint.h>
 
 #include "aom_ports/msvc.h"
 #include "config/aom_config.h"
@@ -32,7 +33,13 @@
 // Returns (int)floor(log2(n)). n must be > 0.
 // These versions of get_msb() are only valid when n != 0 because all
 // of the optimized versions are undefined when n == 0:
-// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// get_byteswap64:
+// Returns the number (uint64_t) with byte-positions reversed
+// e.g. input 0x123456789ABCDEF0 returns 0xF0DEBC9A78563412
+
+// GCC compiler: https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+// MSVC: https://learn.microsoft.com/en-us/cpp/c-runtime-library/
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
@@ -41,6 +48,10 @@
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+  return __builtin_bswap64(num);
+}
 #elif defined(USE_MSC_INTRINSICS)
 #pragma intrinsic(_BitScanReverse)
 
@@ -50,6 +61,10 @@
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+  return _byteswap_uint64(num);
+}
 #undef USE_MSC_INTRINSICS
 #else
 static INLINE int get_msb(unsigned int n) {
@@ -69,6 +84,26 @@
   }
   return log;
 }
+
+static INLINE uint64_t get_byteswap64(uint64_t num) {
+  uint64_t out = 0x00;
+  uint64_t mask = 0xFF00000000000000;
+  int bit_shift = 56;  // 7 bytes
+  // 4 ms bytes
+  do {
+    out |= (num & mask) >> bit_shift;
+    mask >>= 8;
+    bit_shift -= 16;
+  } while (bit_shift >= 0);
+  // 4 ls bytes
+  bit_shift = 8;  // 1 byte
+  do {
+    out |= (num & mask) << bit_shift;
+    mask >>= 8;
+    bit_shift += 16;
+  } while (bit_shift <= 56);
+  return out;
+}
 #endif
 
 #ifdef __cplusplus