Convert palette k-means data from double to float.

About 20% less encoding time is spent in vp10_k_means().

Change-Id: I5cf7605cde869a269776197bace70de353b07d83
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index d6b1563..2e8af98 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -64,7 +64,7 @@
 
 typedef struct {
   uint8_t best_palette_color_map[MAX_SB_SQUARE];
-  double kmeans_data_buf[2 * MAX_SB_SQUARE];
+  float kmeans_data_buf[2 * MAX_SB_SQUARE];
   uint8_t kmeans_indices_buf[MAX_SB_SQUARE];
   uint8_t kmeans_pre_indices_buf[MAX_SB_SQUARE];
 } PALETTE_BUFFER;
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
index d413935..cbc3582 100644
--- a/vp10/encoder/palette.c
+++ b/vp10/encoder/palette.c
@@ -11,20 +11,21 @@
 #include <math.h>
 #include "vp10/encoder/palette.h"
 
-static double calc_dist(const double *p1, const double *p2, int dim) {
-  double dist = 0;
+static float calc_dist(const float *p1, const float *p2, int dim) {
+  float dist = 0;
   int i = 0;
 
   for (i = 0; i < dim; ++i) {
-    dist = dist + (p1[i] - round(p2[i])) * (p1[i] - round(p2[i]));
+    float diff = p1[i] - roundf(p2[i]);
+    dist += diff * diff;
   }
   return dist;
 }
 
-void vp10_calc_indices(const double *data, const double *centroids,
+void vp10_calc_indices(const float *data, const float *centroids,
                        uint8_t *indices, int n, int k, int dim) {
   int i, j;
-  double min_dist, this_dist;
+  float min_dist, this_dist;
 
   for (i = 0; i < n; ++i) {
     min_dist = calc_dist(data + i * dim, centroids, dim);
@@ -45,7 +46,7 @@
   return *state / 65536 % 32768;
 }
 
-static void calc_centroids(const double *data, double *centroids,
+static void calc_centroids(const float *data, float *centroids,
                            const uint8_t *indices, int n, int k, int dim) {
   int i, j, index;
   int count[PALETTE_MAX_SIZE];
@@ -70,16 +71,16 @@
       memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
                  sizeof(centroids[0]) * dim);
     } else {
-      const double norm = 1.0 / count[i];
+      const float norm = 1.0f / count[i];
       for (j = 0; j < dim; ++j)
         centroids[i * dim + j] *= norm;
     }
   }
 }
 
-static double calc_total_dist(const double *data, const double *centroids,
+static float calc_total_dist(const float *data, const float *centroids,
                               const uint8_t *indices, int n, int k, int dim) {
-  double dist = 0;
+  float dist = 0;
   int i;
   (void) k;
 
@@ -89,11 +90,11 @@
   return dist;
 }
 
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
                  uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
   int i = 0;
-  double pre_dist, this_dist;
-  double pre_centroids[2 * PALETTE_MAX_SIZE];
+  float pre_dist, this_dist;
+  float pre_centroids[2 * PALETTE_MAX_SIZE];
 
   vp10_calc_indices(data, centroids, indices, n, k, dim);
   pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
@@ -121,9 +122,9 @@
   return i;
 }
 
-void vp10_insertion_sort(double *data, int n) {
+void vp10_insertion_sort(float *data, int n) {
   int i, j, k;
-  double val;
+  float val;
 
   if (n <= 1)
     return;
diff --git a/vp10/encoder/palette.h b/vp10/encoder/palette.h
index 124cf74..40d9ef9 100644
--- a/vp10/encoder/palette.h
+++ b/vp10/encoder/palette.h
@@ -17,10 +17,10 @@
 extern "C" {
 #endif
 
-void vp10_insertion_sort(double *data, int n);
-void vp10_calc_indices(const double *data, const double *centroids,
+void vp10_insertion_sort(float *data, int n);
+void vp10_calc_indices(const float *data, const float *centroids,
                        uint8_t *indices, int n, int k, int dim);
-int vp10_k_means(const double *data, double *centroids, uint8_t *indices,
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
                  uint8_t *pre_indices, int n, int k, int dim, int max_itr);
 int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 918ad3e..87836cb 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1774,12 +1774,12 @@
     const int max_itr = 50;
     int color_ctx, color_idx = 0;
     int color_order[PALETTE_MAX_SIZE];
-    double *const data = x->palette_buffer->kmeans_data_buf;
+    float *const data = x->palette_buffer->kmeans_data_buf;
     uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
     uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
-    double centroids[PALETTE_MAX_SIZE];
+    float centroids[PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[0].color_index_map;
-    double lb, ub, val;
+    float lb, ub, val;
     MB_MODE_INFO *const mbmi = &mic->mbmi;
     PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -1834,7 +1834,7 @@
                    n, 1, max_itr);
       vp10_insertion_sort(centroids, n);
       for (i = 0; i < n; ++i)
-        centroids[i] = round(centroids[i]);
+        centroids[i] = roundf(centroids[i]);
       // remove duplicates
       i = 1;
       k = n;
@@ -1854,12 +1854,12 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cpi->common.use_highbitdepth)
         for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel_highbd((int)round(centroids[i]),
+          pmi->palette_colors[i] = clip_pixel_highbd((int)lroundf(centroids[i]),
                                                      cpi->common.bit_depth);
       else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         for (i = 0; i < k; ++i)
-          pmi->palette_colors[i] = clip_pixel((int)round(centroids[i]));
+          pmi->palette_colors[i] = clip_pixel((int)lroundf(centroids[i]));
       pmi->palette_size[0] = k;
 
       vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
@@ -3583,12 +3583,12 @@
     int color_ctx, color_idx = 0;
     int color_order[PALETTE_MAX_SIZE];
     int64_t this_sse;
-    double lb_u, ub_u, val_u;
-    double lb_v, ub_v, val_v;
-    double *const data = x->palette_buffer->kmeans_data_buf;
+    float lb_u, ub_u, val_u;
+    float lb_v, ub_v, val_v;
+    float *const data = x->palette_buffer->kmeans_data_buf;
     uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
     uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
-    double centroids[2 * PALETTE_MAX_SIZE];
+    float centroids[2 * PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[1].color_index_map;
     PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
 
@@ -3657,12 +3657,12 @@
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cpi->common.use_highbitdepth)
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel_highbd(round(centroids[j * 2 + i - 1]),
+                clip_pixel_highbd(roundf(centroids[j * 2 + i - 1]),
                                   cpi->common.bit_depth);
           else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
             pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
-                clip_pixel(round(centroids[j * 2 + i - 1]));
+                clip_pixel(roundf(centroids[j * 2 + i - 1]));
         }
       }
       for (r = 0; r < rows; ++r)
@@ -7475,9 +7475,9 @@
   int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
-  double *const data = x->palette_buffer->kmeans_data_buf;
+  float *const data = x->palette_buffer->kmeans_data_buf;
   uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
-  double centroids[2 * PALETTE_MAX_SIZE];
+  float centroids[2 * PALETTE_MAX_SIZE];
   uint8_t *const color_map = xd->plane[1].color_index_map;
   int r, c;
 #if CONFIG_VP9_HIGHBITDEPTH