Port libgav1 scalar optimizations for update_cdf
Improves the performance of the function by about ~35% when not inlined.
In the future, implementing simd should improve performance even more.
Change-Id: Id39a6f377a0b0552d881d2aaf240044affcf380e
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index ea5e4cb..5e25b9c 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -641,26 +641,33 @@
}
static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
- int rate;
- int i, tmp;
-
- static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2 };
assert(nsymbs < 17);
- rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
- nsymbs2speed[nsymbs]; // + get_msb(nsymbs);
- tmp = AOM_ICDF(0);
+ const int count = cdf[nsymbs];
- // Single loop (faster)
- for (i = 0; i < nsymbs - 1; ++i) {
- tmp = (i == val) ? 0 : tmp;
- if (tmp < cdf[i]) {
- cdf[i] -= ((cdf[i] - tmp) >> rate);
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case cdf[N] is |count|.
+ // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
+ // nsymbs > 3. So the equation becomes:
+ // 4 + (count > 15) + (count > 31) + (nsymbs > 3).
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+ // count >> 4 is 2 for count == 31.
+ // Now, the equation becomes:
+ // 4 + (count >> 4) + (nsymbs > 3).
+ const int rate = 4 + (count >> 4) + (nsymbs > 3);
+
+ int i = 0;
+ do {
+ if (i < val) {
+ cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
} else {
- cdf[i] += ((tmp - cdf[i]) >> rate);
+ cdf[i] -= cdf[i] >> rate;
}
- }
- cdf[nsymbs] += (cdf[nsymbs] < 32);
+ } while (++i < nsymbs - 1);
+ cdf[nsymbs] += (count < 32);
}
#ifdef __cplusplus