Add ec_smallmul experiment.

This reduces the multiplier width of daala_ec from 16x15->31 bits to
8x15->23 bits, which reduces hardware latency by an estimated 20%
(and area for this module by an estimated 40%).

These are the smallest logical changes required to achieve this,
but the approach will be optimized significantly in subsequent
commits.

Results when enabled (base run -> experiment run):

ec_smallmul1c_base@2017-03-08T00:49:01.830Z ->
 ec_smallmul1c@2017-03-08T00:49:45.091Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0203 |  0.0203 |  0.0204 |   0.0203 | 0.0203 |  0.0203 |     0.0202

Change-Id: Idbbd3743e9189146cb519d5b984bdabd69e3f4c0
diff --git a/aom_dsp/entdec.c b/aom_dsp/entdec.c
index 28d1388..621c600 100644
--- a/aom_dsp/entdec.c
+++ b/aom_dsp/entdec.c
@@ -202,7 +202,11 @@
   r = dec->rng;
   OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
   OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+  v = r - ((r >> 8) * (uint32_t)(32768U - fz) >> 7);
+#else
   v = fz * (uint32_t)r >> 15;
+#endif
   vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
   ret = 0;
   r_new = v;
@@ -381,7 +385,11 @@
   ret = -1;
   do {
     u = v;
+#if CONFIG_EC_SMALLMUL
+    v = r - ((r >> 8) * (uint32_t)(32768U - cdf[++ret]) >> 7);
+#else
     v = cdf[++ret] * (uint32_t)r >> 15;
+#endif
   } while (v <= c);
   OD_ASSERT(v <= r);
   r = v - u;
diff --git a/aom_dsp/entenc.c b/aom_dsp/entenc.c
index 3a7a47e..76c8588 100644
--- a/aom_dsp/entenc.c
+++ b/aom_dsp/entenc.c
@@ -217,10 +217,21 @@
   l = enc->low;
   r = enc->rng;
   OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+  if (fl > 0) {
+    u = (r >> 8) * (uint32_t)(32768U - fl) >> 7;
+    v = (r >> 8) * (uint32_t)(32768U - fh) >> 7;
+    l += r - u;
+    r = u - v;
+  } else {
+    r -= (r >> 8) * (uint32_t)(32768U - fh) >> 7;
+  }
+#else
   u = fl * (uint32_t)r >> 15;
   v = fh * (uint32_t)r >> 15;
   r = v - u;
   l += u;
+#endif
   od_ec_enc_normalize(enc, l, r);
 #if OD_MEASURE_EC_OVERHEAD
   enc->entropy -= OD_LOG2((double)(fh - fl) / 32768.);
@@ -306,7 +317,11 @@
   l = enc->low;
   r = enc->rng;
   OD_ASSERT(32768U <= r);
+#if CONFIG_EC_SMALLMUL
+  v = r - ((r >> 8) * (uint32_t)(32768U - fz) >> 7);
+#else
   v = fz * (uint32_t)r >> 15;
+#endif
   if (val) l += v;
   r = val ? r - v : v;
   od_ec_enc_normalize(enc, l, r);
diff --git a/configure b/configure
index dd4aef7..6fdc1cc 100755
--- a/configure
+++ b/configure
@@ -286,6 +286,7 @@
     palette
     daala_ec
     rawbits
+    ec_smallmul
     pvq
     xiphrc
     cb4x4
@@ -501,6 +502,10 @@
       log_echo "rawbits requires daala_ec, so disabling rawbits"
       disable_feature rawbits
     fi
+    if enabled ec_smallmul && ! enabled daala_ec; then
+      log_echo "ec_smallmul requires daala_ec, so disabling ec_smallmul"
+      disable_feature ec_smallmul
+    fi
     if enabled ext_tile; then
       log_echo "ext_tile not compatible with reference_buffer, so"
       log_echo "disabling reference_buffer"