Merge "buf_ans: Misc cleanup." into nextgenv2
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 90c596e..f7eb141 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -485,13 +485,10 @@
 #define ALLOW_FILTER_INTRA_MODES 1
 #define ANGLE_STEP 3
 #define MAX_ANGLE_DELTAS 3
-#define ANGLE_FAST_SEARCH 1
-#define ANGLE_SKIP_THRESH 0.10
-#define FILTER_FAST_SEARCH 1
 
 extern const int16_t dr_intra_derivative[270][2];
 
-static uint8_t mode_to_angle_map[INTRA_MODES] = {
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {  // degrees, per mode
     0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
 };
 
diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h
index 803b86d..86ab660 100644
--- a/vp10/common/x86/vp10_txfm1d_sse4.h
+++ b/vp10/common/x86/vp10_txfm1d_sse4.h
@@ -81,33 +81,32 @@
   }
 }
 
-#define round_shift_32_sse4_1(vec, bit)     \
-  ({                                        \
-    __m128i tmp, round;                     \
-    round = _mm_set1_epi32(1 << (bit - 1)); \
-    tmp = _mm_add_epi32(vec, round);        \
-    _mm_srai_epi32(tmp, bit);               \
-  })
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+  __m128i tmp, round;  // NOTE: needs bit >= 1, else the shift below is UB
+  round = _mm_set1_epi32(1 << (bit - 1));  // rounding bias: half of 1 << bit
+  tmp = _mm_add_epi32(vec, round);  // add the bias to every 32-bit lane
+  return _mm_srai_epi32(tmp, bit);  // arithmetic right shift by bit
+}
 
-#define round_shift_array_32_sse4_1(input, output, size, bit) \
-  ({                                                          \
-    if (bit > 0) {                                            \
-      int i;                                                  \
-      for (i = 0; i < size; i++) {                            \
-        output[i] = round_shift_32_sse4_1(input[i], bit);     \
-      }                                                       \
-    } else {                                                  \
-      int i;                                                  \
-      for (i = 0; i < size; i++) {                            \
-        output[i] = _mm_slli_epi32(input[i], -bit);           \
-      }                                                       \
-    }                                                         \
-  })
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+                                               const int size, const int bit) {
+  if (bit > 0) {  // positive bit: rounding right shift of each element
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = round_shift_32_sse4_1(input[i], bit);
+    }
+  } else {  // zero or negative bit: left shift by -bit, no rounding
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm_slli_epi32(input[i], -bit);
+    }
+  }
+}
 
 // out0 = in0*w0 + in1*w1
 // out1 = -in1*w0 + in0*w1
 #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                           \
+  do {                                                         \
     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     ww0 = _mm_set1_epi32(w0);                                  \
     ww1 = _mm_set1_epi32(w1);                                  \
@@ -119,12 +118,12 @@
     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
     out1 = round_shift_32_sse4_1(out1, bit);                   \
-  })
+  } while (0)
 
 // out0 = in0*w0 + in1*w1
 // out1 = in1*w0 - in0*w1
 #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                           \
+  do {                                                         \
     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     ww0 = _mm_set1_epi32(w0);                                  \
     ww1 = _mm_set1_epi32(w1);                                  \
@@ -136,7 +135,7 @@
     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
     out1 = round_shift_32_sse4_1(out1, bit);                   \
-  })
+  } while (0)
 
 #ifdef __cplusplus
 }
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index f2f8291..d7a0f1a 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -81,6 +81,12 @@
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
 
+#if CONFIG_EXT_INTRA
+#define ANGLE_FAST_SEARCH 1
+#define ANGLE_SKIP_THRESH 10  // skip modes scoring below hist_sum / 10
+#define FILTER_FAST_SEARCH 1
+#endif  // CONFIG_EXT_INTRA
+
 const double ADST_FLIP_SVM[8] = {-6.6623, -2.8062, -3.2531, 3.1671,  // vert
                                  -7.7051, -3.2234, -3.6193, 3.4533};  // horz
 
@@ -2531,99 +2537,134 @@
   return best_rd;
 }
 
-static INLINE int get_angle_index(double angle) {
-  const double step = 22.5, base = 45;
-  return (int)lround((angle - base) / step);
-}
+// Angle bin, indexed by [sign][min(|dx|/|dy|, 6)][16 * frac(|dx|/|dy|)].
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+    {  // sign index 0: dx > 0 and dy > 0 agree
+        {6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, },
+        {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+    {  // sign index 1: exactly one of dx > 0, dy > 0 holds
+        {6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, },
+        {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+};
+
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {  // hist bin per mode
+    0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+};
 
 static void angle_estimation(const uint8_t *src, int src_stride,
-                             int rows, int cols, double *hist) {
-  int r, c, i, index;
-  double angle, dx, dy;
-  double temp, divisor;
+                             int rows, int cols,
+                             uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];  // squared-gradient sum per angle bin
+  uint64_t hist_sum = 0;  // total of all bins, filled in after the scan
 
-  vpx_clear_system_state();
-  divisor = 0;
-  for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] = 0;
-
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
-      if (dy == 0)
-        angle = 90;
-      else
-        angle = (atan((double)dx / (double)dy)) * 180 / PI;
-      assert(angle >= -90 && angle <= 90);
-      index = get_angle_index(angle + 180);
-      if (index < DIRECTIONAL_MODES) {
-        hist[index] += temp;
-        divisor += temp;
+      if (dy == 0) {  // handled apart: the else branch divides by dy
+        index = 2;  // the bin that mode_to_angle_bin assigns to mode 1
+      } else {
+        sn = (dx > 0) ^ (dy > 0);  // 1 iff exactly one of the tests holds
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;  // integer part of |dx| / |dy|
+        remd = remd * 16 / dy;  // fractional part in 1/16 steps, 0..15
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
       }
-      if (angle > 0) {
-        index = get_angle_index(angle);
-        if (index >= 0) {
-          hist[index] += temp;
-          divisor += temp;
-        }
-      }
+      hist[index] += temp;  // each pixel votes with its squared gradient norm
     }
     src += src_stride;
   }
 
-  if (divisor < 1)
-    divisor = 1;
   for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] /= divisor;
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {  // sets mask entries, never clears any
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];  // shadows the function-level index
+      uint64_t score = 2 * hist[index];  // the mode's own bin counts double
+      int weight = 2;  // sum of the weights of the bins added into score
+      if (index > 0) {
+        score += hist[index - 1];  // lower neighbour bin, weight 1
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];  // upper neighbour bin, weight 1
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH  < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;  // weighted avg < hist_sum / thresh
+    }
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
-                                    int rows, int cols, double *hist) {
-  int r, c, i, index;
-  double angle, dx, dy;
-  double temp, divisor;
+                                    int rows, int cols,
+                                    uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];  // squared-gradient sum per angle bin
+  uint64_t hist_sum = 0;  // total of all bins, filled in after the scan
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
 
-  vpx_clear_system_state();
-  divisor = 0;
-  for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] = 0;
-
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
-      if (dy == 0)
-        angle = 90;
-      else
-        angle = (atan((double)dx / (double)dy)) * 180 / PI;
-      assert(angle >= -90 && angle <= 90);
-      index = get_angle_index(angle + 180);
-      if (index < DIRECTIONAL_MODES) {
-        hist[index] += temp;
-        divisor += temp;
+      if (dy == 0) {  // handled apart: the else branch divides by dy
+        index = 2;  // the bin that mode_to_angle_bin assigns to mode 1
+      } else {
+        sn = (dx > 0) ^ (dy > 0);  // 1 iff exactly one of the tests holds
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;  // integer part of |dx| / |dy|
+        remd = remd * 16 / dy;  // fractional part in 1/16 steps, 0..15
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
       }
-      if (angle > 0) {
-        index = get_angle_index(angle);
-        if (index >= 0) {
-          hist[index] += temp;
-          divisor += temp;
-        }
-      }
+      hist[index] += temp;  // each pixel votes with its squared gradient norm
     }
     src += src_stride;
   }
 
-  if (divisor < 1)
-    divisor = 1;
   for (i = 0; i < DIRECTIONAL_MODES; ++i)
-    hist[i] /= divisor;
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {  // sets mask entries, never clears any
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];  // shadows the function-level index
+      uint64_t score = 2 * hist[index];  // the mode's own bin counts double
+      int weight = 2;  // sum of the weights of the bins added into score
+      if (index > 0) {
+        score += hist[index - 1];  // lower neighbour bin, weight 1
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];  // upper neighbour bin, weight 1
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH  < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;  // weighted avg < hist_sum / thresh
+    }
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_EXT_INTRA
@@ -2649,7 +2690,6 @@
   uint8_t directional_mode_skip_mask[INTRA_MODES];
   const int src_stride = x->plane[0].src.stride;
   const uint8_t *src = x->plane[0].src.buf;
-  double hist[DIRECTIONAL_MODES];
 #endif  // CONFIG_EXT_INTRA
   TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
@@ -2675,29 +2715,11 @@
          sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    highbd_angle_estimation(src, src_stride, rows, cols, hist);
+    highbd_angle_estimation(src, src_stride, rows, cols,
+                            directional_mode_skip_mask);
   else
 #endif
-    angle_estimation(src, src_stride, rows, cols, hist);
-
-  for (mode = 0; mode < INTRA_MODES; ++mode) {
-    if (mode != DC_PRED && mode != TM_PRED) {
-      int index = get_angle_index((double)mode_to_angle_map[mode]);
-      double score, weight = 1.0;
-      score = hist[index];
-      if (index > 0) {
-        score += hist[index - 1] * 0.5;
-        weight += 0.5;
-      }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1] * 0.5;
-        weight += 0.5;
-      }
-      score /= weight;
-      if (score < ANGLE_SKIP_THRESH)
-        directional_mode_skip_mask[mode] = 1;
-    }
-  }
+    angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
 #endif  // CONFIG_EXT_INTRA
   memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   palette_mode_info.palette_size[0] = 0;
@@ -8150,33 +8172,14 @@
           const uint8_t *src = x->plane[0].src.buf;
           const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
           const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-          double hist[DIRECTIONAL_MODES];
-          PREDICTION_MODE mode;
-
 #if CONFIG_VP9_HIGHBITDEPTH
           if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-            highbd_angle_estimation(src, src_stride, rows, cols, hist);
+            highbd_angle_estimation(src, src_stride, rows, cols,
+                                    directional_mode_skip_mask);
           else
 #endif
-            angle_estimation(src, src_stride, rows, cols, hist);
-          for (mode = 0; mode < INTRA_MODES; ++mode) {
-            if (mode != DC_PRED && mode != TM_PRED) {
-              int index = get_angle_index((double)mode_to_angle_map[mode]);
-              double score, weight = 1.0;
-              score = hist[index];
-              if (index > 0) {
-                score += hist[index - 1] * 0.5;
-                weight += 0.5;
-              }
-              if (index < DIRECTIONAL_MODES - 1) {
-                score += hist[index + 1] * 0.5;
-                weight += 0.5;
-              }
-              score /= weight;
-              if (score < ANGLE_SKIP_THRESH)
-                directional_mode_skip_mask[mode] = 1;
-            }
-          }
+            angle_estimation(src, src_stride, rows, cols,
+                             directional_mode_skip_mask);
           angle_stats_ready = 1;
         }
         if (directional_mode_skip_mask[mbmi->mode])