Add Daala entropy coder.

Change-Id: I2849a50163268d58cc5d80aacfec1fd02299ca43
diff --git a/aom_dsp/entdec.c b/aom_dsp/entdec.c
new file mode 100644
index 0000000..4dcc0a1
--- /dev/null
+++ b/aom_dsp/entdec.c
@@ -0,0 +1,534 @@
+/*Daala video codec
+Copyright (c) 2001-2013 Daala project contributors.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "entdec.h"
+#if OD_ACCOUNTING
+#include "accounting.h"
+#endif
+
+/*A range decoder.
+  This is an entropy decoder based upon \cite{Mar79}, which is itself a
+   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+  It is very similar to arithmetic encoding, except that encoding is done with
+   digits in any base, instead of with bits, and so it is faster when using
+   larger bases (i.e.: a byte).
+  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+   is the base, longer than the theoretical optimum, but to my knowledge there
+   is no published justification for this claim.
+  This only seems true when using near-infinite precision arithmetic so that
+   the process is carried out with no rounding errors.
+
+  An excellent description of implementation details is available at
+   http://www.arturocampos.com/ac_range.html
+  A recent work \cite{MNW98} which proposes several changes to arithmetic
+   encoding for efficiency actually re-discovers many of the principles
+   behind range encoding, and presents a good theoretical analysis of them.
+
+  End of stream is handled by writing out the smallest number of bits that
+   ensures that the stream will be correctly decoded regardless of the value of
+   any subsequent bits.
+  od_ec_dec_tell() can be used to determine how many bits were needed to decode
+   all the symbols thus far; other data can be packed in the remaining bits of
+   the input buffer.
+  @PHDTHESIS{Pas76,
+    author="Richard Clark Pasco",
+    title="Source coding algorithms for fast data compression",
+    school="Dept. of Electrical Engineering, Stanford University",
+    address="Stanford, CA",
+    month=May,
+    year=1976,
+    URL="http://www.richpasco.org/scaffdc.pdf"
+  }
+  @INPROCEEDINGS{Mar79,
+   author="Martin, G.N.N.",
+   title="Range encoding: an algorithm for removing redundancy from a digitised
+    message",
+   booktitle="Video & Data Recording Conference",
+   year=1979,
+   address="Southampton",
+   month=Jul,
+   URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+  }
+  @ARTICLE{MNW98,
+   author="Alistair Moffat and Radford Neal and Ian H. Witten",
+   title="Arithmetic Coding Revisited",
+   journal="{ACM} Transactions on Information Systems",
+   year=1998,
+   volume=16,
+   number=3,
+   pages="256--294",
+   month=Jul,
+   URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+  }*/
+
+#if OD_ACCOUNTING
+#define od_ec_dec_normalize(dec, dif, rng, ret, str) \
+  od_ec_dec_normalize_(dec, dif, rng, ret, str)
+static void od_process_accounting(od_ec_dec *dec, char *str) {
+  if (dec->acct != NULL) {
+    uint32_t tell;
+    tell = od_ec_dec_tell_frac(dec);
+    OD_ASSERT(tell >= dec->acct->last_tell);
+    od_accounting_record(dec->acct, str, tell - dec->acct->last_tell);
+    dec->acct->last_tell = tell;
+  }
+}
+#else
+#define od_ec_dec_normalize(dec, dif, rng, ret, str) \
+  od_ec_dec_normalize_(dec, dif, rng, ret)
+#endif
+
+/*This is meant to be a large, positive constant that can still be efficiently
+   loaded as an immediate (on platforms like ARM, for example).
+  Even relatively modest values like 100 would work fine.*/
+#define OD_EC_LOTS_OF_BITS (0x4000)
+
+static void od_ec_dec_refill(od_ec_dec *dec) {
+  int s;
+  od_ec_window dif;
+  int16_t cnt;
+  const unsigned char *bptr;
+  const unsigned char *end;
+  dif = dec->dif;
+  cnt = dec->cnt;
+  bptr = dec->bptr;
+  end = dec->end;
+  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
+  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
+    OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8);
+    dif |= (od_ec_window)bptr[0] << s;
+    cnt += 8;
+  }
+  if (bptr >= end) {
+    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
+    cnt = OD_EC_LOTS_OF_BITS;
+  }
+  dec->dif = dif;
+  dec->cnt = cnt;
+  dec->bptr = bptr;
+}
+
+/*Takes updated dif and range values, renormalizes them so that
+   32768 <= rng < 65536 (reading more bytes from the stream into dif if
+   necessary), and stores them back in the decoder context.
+  dif: The new value of dif.
+  rng: The new value of the range.
+  ret: The value to return.
+  Return: ret.
+          This allows the compiler to jump to this function via a tail-call.*/
+static int od_ec_dec_normalize_(od_ec_dec *dec, od_ec_window dif, unsigned rng,
+                                int ret OD_ACC_STR) {
+  int d;
+  OD_ASSERT(rng <= 65535U);
+  d = 16 - OD_ILOG_NZ(rng);
+  dec->cnt -= d;
+  dec->dif = dif << d;
+  dec->rng = rng << d;
+  if (dec->cnt < 0) od_ec_dec_refill(dec);
+#if OD_ACCOUNTING
+  od_process_accounting(dec, acc_str);
+#endif
+  return ret;
+}
+
+/*Initializes the decoder.
+  buf: The input buffer to use.
+  Return: 0 on success, or a negative value on error.*/
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
+                    uint32_t storage) {
+  dec->buf = buf;
+  dec->eptr = buf + storage;
+  dec->end_window = 0;
+  dec->nend_bits = 0;
+  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
+  dec->end = buf + storage;
+  dec->bptr = buf;
+  dec->dif = 0;
+  dec->rng = 0x8000;
+  dec->cnt = -15;
+  dec->error = 0;
+  od_ec_dec_refill(dec);
+#if OD_ACCOUNTING
+  dec->acct = NULL;
+#endif
+}
+
+/*Decode a bit that has an fz/ft probability of being a zero.
+  fz: The probability that the bit is zero, scaled by _ft.
+  ft: The total probability.
+      This must be at least 16384 and no more than 32768.
+  Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_(od_ec_dec *dec, unsigned fz, unsigned ft OD_ACC_STR) {
+  od_ec_window dif;
+  od_ec_window vw;
+  unsigned r;
+  int s;
+  unsigned v;
+  int ret;
+  OD_ASSERT(0 < fz);
+  OD_ASSERT(fz < ft);
+  OD_ASSERT(16384 <= ft);
+  OD_ASSERT(ft <= 32768U);
+  dif = dec->dif;
+  r = dec->rng;
+  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  OD_ASSERT(ft <= r);
+  s = r - ft >= ft;
+  ft <<= s;
+  fz <<= s;
+  OD_ASSERT(r - ft < ft);
+#if OD_EC_REDUCED_OVERHEAD
+  {
+    unsigned d;
+    unsigned e;
+    d = r - ft;
+    e = OD_SUBSATU(2 * d, ft);
+    v = fz + OD_MINI(fz, e) + OD_MINI(OD_SUBSATU(fz, e) >> 1, d);
+  }
+#else
+  v = fz + OD_MINI(fz, r - ft);
+#endif
+  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+  ret = dif >= vw;
+  if (ret) dif -= vw;
+  r = ret ? r - v : v;
+  return od_ec_dec_normalize(dec, dif, r, ret, acc_str);
+}
+
+/*Decode a bit that has an fz probability of being a zero in Q15.
+  This is a simpler, lower overhead version of od_ec_decode_bool() for use when
+   ft == 32768.
+  To be decoded properly by this function, symbols cannot have been encoded by
+   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
+   or _dyadic() functions instead.
+  fz: The probability that the bit is zero, scaled by 32768.
+  Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_q15_(od_ec_dec *dec, unsigned fz OD_ACC_STR) {
+  od_ec_window dif;
+  od_ec_window vw;
+  unsigned r;
+  unsigned v;
+  int ret;
+  OD_ASSERT(0 < fz);
+  OD_ASSERT(fz < 32768U);
+  dif = dec->dif;
+  r = dec->rng;
+  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  OD_ASSERT(32768U <= r);
+  v = fz * (uint32_t)r >> 15;
+  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+  ret = dif >= vw;
+  if (ret) dif -= vw;
+  r = ret ? r - v : v;
+  return od_ec_dec_normalize(dec, dif, r, ret, acc_str);
+}
+
+/*Decodes a symbol given a cumulative distribution function (CDF) table.
+  cdf: The CDF, such that symbol s falls in the range
+        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+       The values must be monotonically non-increasing, and cdf[nsyms - 1]
+        must be at least 16384, and no more than 32768.
+  nsyms: The number of symbols in the alphabet.
+         This should be at most 16.
+  Return: The decoded symbol s.*/
+int od_ec_decode_cdf_(od_ec_dec *dec, const uint16_t *cdf,
+                      int nsyms OD_ACC_STR) {
+  od_ec_window dif;
+  unsigned r;
+  unsigned c;
+  unsigned d;
+#if OD_EC_REDUCED_OVERHEAD
+  unsigned e;
+#endif
+  int s;
+  unsigned u;
+  unsigned v;
+  unsigned q;
+  unsigned fl;
+  unsigned fh;
+  unsigned ft;
+  int ret;
+  dif = dec->dif;
+  r = dec->rng;
+  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  OD_ASSERT(nsyms > 0);
+  ft = cdf[nsyms - 1];
+  OD_ASSERT(16384 <= ft);
+  OD_ASSERT(ft <= 32768U);
+  OD_ASSERT(ft <= r);
+  s = r - ft >= ft;
+  ft <<= s;
+  d = r - ft;
+  OD_ASSERT(d < ft);
+  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+  q = OD_MAXI((int)(c >> 1), (int)(c - d));
+#if OD_EC_REDUCED_OVERHEAD
+  e = OD_SUBSATU(2 * d, ft);
+  /*The correctness of this inverse partition function is not obvious, but it
+     was checked exhaustively for all possible values of r, ft, and c.
+    TODO: It should be possible to optimize this better than the compiler,
+     given that we do not care about the accuracy of negative results (as we
+     will not use them).
+    It would also be nice to get rid of the 32-bit dividend, as it requires a
+     32x32->64 bit multiply to invert.*/
+  q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
+#endif
+  q >>= s;
+  OD_ASSERT(q<ft>> s);
+  fl = 0;
+  ret = 0;
+  for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
+  OD_ASSERT(fh <= ft >> s);
+  fl <<= s;
+  fh <<= s;
+#if OD_EC_REDUCED_OVERHEAD
+  u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
+  v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
+#else
+  u = fl + OD_MINI(fl, d);
+  v = fh + OD_MINI(fh, d);
+#endif
+  r = v - u;
+  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
+  return od_ec_dec_normalize(dec, dif, r, ret, acc_str);
+}
+
+/*Decodes a symbol given a cumulative distribution function (CDF) table.
+  cdf: The CDF, such that symbol s falls in the range
+        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+       The values must be monotonically non-increasing, and cdf[nsyms - 1]
+       must be at least 2, and no more than 32768.
+  nsyms: The number of symbols in the alphabet.
+         This should be at most 16.
+  Return: The decoded symbol s.*/
+int od_ec_decode_cdf_unscaled_(od_ec_dec *dec, const uint16_t *cdf,
+                               int nsyms OD_ACC_STR) {
+  od_ec_window dif;
+  unsigned r;
+  unsigned c;
+  unsigned d;
+#if OD_EC_REDUCED_OVERHEAD
+  unsigned e;
+#endif
+  int s;
+  unsigned u;
+  unsigned v;
+  unsigned q;
+  unsigned fl;
+  unsigned fh;
+  unsigned ft;
+  int ret;
+  dif = dec->dif;
+  r = dec->rng;
+  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  OD_ASSERT(nsyms > 0);
+  ft = cdf[nsyms - 1];
+  OD_ASSERT(2 <= ft);
+  OD_ASSERT(ft <= 32768U);
+  s = 15 - OD_ILOG_NZ(ft - 1);
+  ft <<= s;
+  OD_ASSERT(ft <= r);
+  if (r - ft >= ft) {
+    ft <<= 1;
+    s++;
+  }
+  d = r - ft;
+  OD_ASSERT(d < ft);
+  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+  q = OD_MAXI((int)(c >> 1), (int)(c - d));
+#if OD_EC_REDUCED_OVERHEAD
+  e = OD_SUBSATU(2 * d, ft);
+  /*TODO: See TODO above.*/
+  q = OD_MAXI((int)q, (int)((2 * (int32_t)c + 1 - (int32_t)e) / 3));
+#endif
+  q >>= s;
+  OD_ASSERT(q<ft>> s);
+  fl = 0;
+  ret = 0;
+  for (fh = cdf[ret]; fh <= q; fh = cdf[++ret]) fl = fh;
+  OD_ASSERT(fh <= ft >> s);
+  fl <<= s;
+  fh <<= s;
+#if OD_EC_REDUCED_OVERHEAD
+  u = fl + OD_MINI(fl, e) + OD_MINI(OD_SUBSATU(fl, e) >> 1, d);
+  v = fh + OD_MINI(fh, e) + OD_MINI(OD_SUBSATU(fh, e) >> 1, d);
+#else
+  u = fl + OD_MINI(fl, d);
+  v = fh + OD_MINI(fh, d);
+#endif
+  r = v - u;
+  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
+  return od_ec_dec_normalize(dec, dif, r, ret, acc_str);
+}
+
+/*Decodes a symbol given a cumulative distribution function (CDF) table that
+   sums to a power of two.
+  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
+   cdf[nsyms - 1] is a power of two.
+  To be decoded properly by this function, symbols cannot have been encoded by
+   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
+   functions instead.
+  cdf: The CDF, such that symbol s falls in the range
+        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+       The values must be monotonically non-increasing, and cdf[nsyms - 1]
+       must be exactly 1 << ftb.
+  nsyms: The number of symbols in the alphabet.
+         This should be at most 16.
+  ftb: The number of bits of precision in the cumulative distribution.
+       This must be no more than 15.
+  Return: The decoded symbol s.*/
+int od_ec_decode_cdf_unscaled_dyadic_(od_ec_dec *dec, const uint16_t *cdf,
+                                      int nsyms, unsigned ftb OD_ACC_STR) {
+  od_ec_window dif;
+  unsigned r;
+  unsigned c;
+  unsigned u;
+  unsigned v;
+  int ret;
+  (void)nsyms;
+  dif = dec->dif;
+  r = dec->rng;
+  OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+  OD_ASSERT(ftb <= 15);
+  OD_ASSERT(cdf[nsyms - 1] == 1U << ftb);
+  OD_ASSERT(32768U <= r);
+  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+  v = 0;
+  ret = -1;
+  do {
+    u = v;
+    v = cdf[++ret] * (uint32_t)r >> ftb;
+  } while (v <= c);
+  OD_ASSERT(v <= r);
+  r = v - u;
+  dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16);
+  return od_ec_dec_normalize(dec, dif, r, ret, acc_str);
+}
+
+/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15.
+  This is a simpler, lower overhead version of od_ec_decode_cdf() for use when
+   cdf[nsyms - 1] == 32768.
+  To be decoded properly by this function, symbols cannot have been encoded by
+   od_ec_encode(), but must have been encoded with one of the equivalent _q15()
+   or dyadic() functions instead.
+  cdf: The CDF, such that symbol s falls in the range
+        [s > 0 ? cdf[s - 1] : 0, cdf[s]).
+       The values must be monotonically non-increasing, and cdf[nsyms - 1]
+        must be 32768.
+  nsyms: The number of symbols in the alphabet.
+         This should be at most 16.
+  Return: The decoded symbol s.*/
+int od_ec_decode_cdf_q15_(od_ec_dec *dec, const uint16_t *cdf,
+                          int nsyms OD_ACC_STR) {
+  return od_ec_decode_cdf_unscaled_dyadic(dec, cdf, nsyms, 15, acc_str);
+}
+
+/*Extracts a raw unsigned integer with a non-power-of-2 range from the stream.
+  The integer must have been encoded with od_ec_enc_uint().
+  ft: The number of integers that can be decoded (one more than the max).
+      This must be at least 2, and no more than 2**29.
+  Return: The decoded bits.*/
+uint32_t od_ec_dec_uint_(od_ec_dec *dec, uint32_t ft OD_ACC_STR) {
+  OD_ASSERT(ft >= 2);
+  OD_ASSERT(ft <= (uint32_t)1 << (25 + OD_EC_UINT_BITS));
+  if (ft > 1U << OD_EC_UINT_BITS) {
+    uint32_t t;
+    int ft1;
+    int ftb;
+    ft--;
+    ftb = OD_ILOG_NZ(ft) - OD_EC_UINT_BITS;
+    ft1 = (int)(ft >> ftb) + 1;
+    t = od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft1), ft1, acc_str);
+    t = t << ftb | od_ec_dec_bits(dec, ftb, acc_str);
+    if (t <= ft) return t;
+    dec->error = 1;
+    return ft;
+  }
+  return od_ec_decode_cdf_q15(dec, OD_UNIFORM_CDF_Q15(ft), (int)ft, acc_str);
+}
+
+/*Extracts a sequence of raw bits from the stream.
+  The bits must have been encoded with od_ec_enc_bits().
+  ftb: The number of bits to extract.
+       This must be between 0 and 25, inclusive.
+  Return: The decoded bits.*/
+uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb OD_ACC_STR) {
+  od_ec_window window;
+  int available;
+  uint32_t ret;
+  OD_ASSERT(ftb <= 25);
+  window = dec->end_window;
+  available = dec->nend_bits;
+  if ((unsigned)available < ftb) {
+    const unsigned char *buf;
+    const unsigned char *eptr;
+    buf = dec->buf;
+    eptr = dec->eptr;
+    OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8);
+    do {
+      if (eptr <= buf) {
+        dec->tell_offs += OD_EC_LOTS_OF_BITS - available;
+        available = OD_EC_LOTS_OF_BITS;
+        break;
+      }
+      window |= (od_ec_window) * --eptr << available;
+      available += 8;
+    } while (available <= OD_EC_WINDOW_SIZE - 8);
+    dec->eptr = eptr;
+  }
+  ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1);
+  window >>= ftb;
+  available -= ftb;
+  dec->end_window = window;
+  dec->nend_bits = available;
+#if OD_ACCOUNTING
+  od_process_accounting(dec, acc_str);
+#endif
+  return ret;
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+  This same number can be computed in either the encoder or the decoder, and is
+   suitable for making coding decisions.
+  Return: The number of bits.
+          This will always be slightly larger than the exact value (e.g., all
+           rounding error is in the positive direction).*/
+int od_ec_dec_tell(od_ec_dec *dec) {
+  return ((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - dec->cnt -
+         dec->nend_bits + dec->tell_offs;
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+  This same number can be computed in either the encoder or the decoder, and is
+   suitable for making coding decisions.
+  Return: The number of bits scaled by 2**OD_BITRES.
+          This will always be slightly larger than the exact value (e.g., all
+           rounding error is in the positive direction).*/
+uint32_t od_ec_dec_tell_frac(od_ec_dec *dec) {
+  return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
+}