Remove calls to atan in generate_hog

This CL replaces calls to atan in generate_hog with a llok up table to
find the histogram bin index.

There is a small encoding difference due to the new rounding rules, but
the difference is on the order of 0.001%.

Performance on Speed 6:
 FRAME_TYPE | TESTSET |  SPD
            |   HDRES | +2.0%
    INTRA   |  MIDRES | +1.6%
            |  LOWRES | +1.2%
------------+---------+--------
            |   HDRES | +0.6%
    INTER   |  MIDRES | +0.4%
            |  LOWRES | +0.3%

STATS_CHANGE

Change-Id: I50bdd93040c5f00929a846005c3c34d9a77f2923
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 131f3cf..e7f7189 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -81,9 +81,47 @@
   -1.222860f, -1.502437f, -1.900969f, -3.206816f,
 };
 
+#define FIX_PREC_BITS (16)
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+  const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+  // Find index by bisection
+  static const int thresholds[BINS] = {
+    -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+    -59392,   -48579,  -39272,  -30982,  -23445,  -16400,  -9715,  -3194,
+    3227,     9748,    16433,   23478,   31015,   39305,   48611,  59425,
+    72336,    88392,   109364,  138593,  183191,  261638,  441831, INT32_MAX
+  };
+
+  int lo_idx = 0, hi_idx = BINS - 1;
+  // Divide into segments of size 8 gives better performance than binary search
+  // here.
+  if (ratio <= thresholds[7]) {
+    lo_idx = 0;
+    hi_idx = 7;
+  } else if (ratio <= thresholds[15]) {
+    lo_idx = 8;
+    hi_idx = 15;
+  } else if (ratio <= thresholds[23]) {
+    lo_idx = 16;
+    hi_idx = 23;
+  } else {
+    lo_idx = 24;
+    hi_idx = 31;
+  }
+
+  for (int idx = lo_idx; idx <= hi_idx; idx++) {
+    if (ratio <= thresholds[idx]) {
+      return idx;
+    }
+  }
+  assert(0 && "No valid histogram bin found!");
+  return BINS - 1;
+}
+#undef FIX_PREC_BITS
+
 static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows,
                                     int cols, float *hist) {
-  const float step = (float)PI / BINS;
   float total = 0.1f;
   src += stride;
   for (int r = 1; r < rows - 1; ++r) {
@@ -105,10 +143,8 @@
         hist[0] += temp / 2;
         hist[BINS - 1] += temp / 2;
       } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
         hist[idx] += temp;
       }
     }
@@ -120,7 +156,6 @@
 
 static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride,
                                         int rows, int cols, float *hist) {
-  const float step = (float)PI / BINS;
   float total = 0.1f;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   src += stride;
@@ -143,10 +178,8 @@
         hist[0] += temp / 2;
         hist[BINS - 1] += temp / 2;
       } else {
-        const float angle = atanf(dy * 1.0f / dx);
-        int idx = (int)roundf(angle / step) + BINS / 2;
-        idx = AOMMIN(idx, BINS - 1);
-        idx = AOMMAX(idx, 0);
+        const int idx = get_hist_bin_idx(dx, dy);
+        assert(idx >= 0 && idx < BINS);
         hist[idx] += temp;
       }
     }