Palette: replace floating point numbers with integers

Performance change is within noise range.

Change-Id: I9ca2ea6dfdc629513836637283ae3f964a1dd497
diff --git a/aom_ports/mem.h b/aom_ports/mem.h
index 500e397..937b79f 100644
--- a/aom_ports/mem.h
+++ b/aom_ports/mem.h
@@ -61,6 +61,8 @@
 #define ALIGN_POWER_OF_TWO(value, n) \
   (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
+#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+
 #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
 #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
 
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index ec74a02..66e9753 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -26,7 +26,6 @@
 #endif
 
 #define MAX_TXFM_STAGE_NUM 12
-#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
 
 static const int cos_bit_min = 10;
 static const int cos_bit_max = 16;
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index f02b9e0..4489913 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -117,7 +117,7 @@
 
 typedef struct {
   uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
-  float kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+  int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
 } PALETTE_BUFFER;
 
 typedef struct {
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 3a433d9..9e526b8 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -23,25 +23,23 @@
 #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
 #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
 
-static float RENAME(calc_dist)(const float *p1, const float *p2) {
-  float dist = 0;
-  int i;
-  for (i = 0; i < AV1_K_MEANS_DIM; ++i) {
-    const float diff = p1[i] - p2[i];
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+  int dist = 0;
+  for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+    const int diff = p1[i] - p2[i];
     dist += diff * diff;
   }
   return dist;
 }
 
-void RENAME(av1_calc_indices)(const float *data, const float *centroids,
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
                               uint8_t *indices, int n, int k) {
-  int i, j;
-  for (i = 0; i < n; ++i) {
-    float min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+  for (int i = 0; i < n; ++i) {
+    int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
     indices[i] = 0;
-    for (j = 1; j < k; ++j) {
-      const float this_dist = RENAME(calc_dist)(
-          data + i * AV1_K_MEANS_DIM, centroids + j * AV1_K_MEANS_DIM);
+    for (int j = 1; j < k; ++j) {
+      const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                                              centroids + j * AV1_K_MEANS_DIM);
       if (this_dist < min_dist) {
         min_dist = this_dist;
         indices[i] = j;
@@ -50,19 +48,16 @@
   }
 }
 
-static void RENAME(calc_centroids)(const float *data, float *centroids,
+static void RENAME(calc_centroids)(const int *data, int *centroids,
                                    const uint8_t *indices, int n, int k) {
-  int i, j, index;
-  int count[PALETTE_MAX_SIZE];
+  int i, j;
+  int count[PALETTE_MAX_SIZE] = { 0 };
   unsigned int rand_state = (unsigned int)data[0];
-
   assert(n <= 32768);
-
-  memset(count, 0, sizeof(count[0]) * k);
   memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
 
   for (i = 0; i < n; ++i) {
-    index = indices[i];
+    const int index = indices[i];
     assert(index < k);
     ++count[index];
     for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
@@ -76,43 +71,35 @@
              data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
              sizeof(centroids[0]) * AV1_K_MEANS_DIM);
     } else {
-      const float norm = 1.0f / count[i];
-      for (j = 0; j < AV1_K_MEANS_DIM; ++j)
-        centroids[i * AV1_K_MEANS_DIM + j] *= norm;
+      for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+        centroids[i * AV1_K_MEANS_DIM + j] =
+            DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+      }
     }
   }
-
-  // Round to nearest integers.
-  for (i = 0; i < k * AV1_K_MEANS_DIM; ++i) {
-    centroids[i] = roundf(centroids[i]);
-  }
 }
 
-static float RENAME(calc_total_dist)(const float *data, const float *centroids,
-                                     const uint8_t *indices, int n, int k) {
-  float dist = 0;
-  int i;
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+                                       const uint8_t *indices, int n, int k) {
+  int64_t dist = 0;
   (void)k;
-
-  for (i = 0; i < n; ++i)
+  for (int i = 0; i < n; ++i) {
     dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
                               centroids + indices[i] * AV1_K_MEANS_DIM);
-
+  }
   return dist;
 }
 
-void RENAME(av1_k_means)(const float *data, float *centroids, uint8_t *indices,
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
                          int n, int k, int max_itr) {
-  int i;
-  float this_dist;
-  float pre_centroids[2 * PALETTE_MAX_SIZE];
+  int pre_centroids[2 * PALETTE_MAX_SIZE];
   uint8_t pre_indices[MAX_SB_SQUARE];
 
   RENAME(av1_calc_indices)(data, centroids, indices, n, k);
-  this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+  int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
 
-  for (i = 0; i < max_itr; ++i) {
-    const float pre_dist = this_dist;
+  for (int i = 0; i < max_itr; ++i) {
+    const int64_t pre_dist = this_dist;
     memcpy(pre_centroids, centroids,
            sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
     memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
@@ -132,6 +119,5 @@
       break;
   }
 }
-
 #undef RENAME_
 #undef RENAME
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index 94e866f..5fcbf9a 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -23,16 +23,14 @@
 #include "av1/encoder/k_means_template.h"
 #undef AV1_K_MEANS_DIM
 
-static int float_comparer(const void *a, const void *b) {
-  const float fa = *(const float *)a;
-  const float fb = *(const float *)b;
-  return (fa > fb) - (fa < fb);
+static int int_comparer(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
 }
 
-int av1_remove_duplicates(float *centroids, int num_centroids) {
+int av1_remove_duplicates(int *centroids, int num_centroids) {
   int num_unique;  // number of unique centroids
   int i;
-  qsort(centroids, num_centroids, sizeof(*centroids), float_comparer);
+  qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
   // Remove duplicates.
   num_unique = 1;
   for (i = 1; i < num_centroids; ++i) {
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index faf2229..bbdd507 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -20,22 +20,22 @@
 
 #define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
 
-void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const float *data,
-                                             const float *centroids,
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
+                                             const int *centroids,
                                              uint8_t *indices, int n, int k);
-void AV1_K_MEANS_RENAME(av1_k_means, 1)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
-void AV1_K_MEANS_RENAME(av1_k_means, 2)(const float *data, float *centroids,
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
                                         uint8_t *indices, int n, int k,
                                         int max_itr);
 
 // Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
 // calculate the centroid 'indices' for the data points.
-static INLINE void av1_calc_indices(const float *data, const float *centroids,
+static INLINE void av1_calc_indices(const int *data, const int *centroids,
                                     uint8_t *indices, int n, int k, int dim) {
   if (dim == 1) {
     AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
@@ -50,7 +50,7 @@
 // dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
 // updated 'centroids' and the centroid 'indices' for elements in 'data'.
 // Note: the output centroids are rounded off to nearest integers.
-static INLINE void av1_k_means(const float *data, float *centroids,
+static INLINE void av1_k_means(const int *data, int *centroids,
                                uint8_t *indices, int n, int k, int dim,
                                int max_itr) {
   if (dim == 1) {
@@ -66,7 +66,7 @@
 // puts these unique centroids in first 'k' indices of 'centroids' array.
 // Ideally, the centroids should be rounded to integers before calling this
 // method.
-int av1_remove_duplicates(float *centroids, int num_centroids);
+int av1_remove_duplicates(int *centroids, int num_centroids);
 
 // Given a color cache and a set of base colors, find if each cache color is
 // present in the base colors, record the binary results in "cache_color_found".
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 2f63807..7bd299d 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -2797,20 +2797,19 @@
 // Bias toward using colors in the cache.
 // TODO(huisu): Try other schemes to improve compression.
 static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
-                                    int n_colors, int stride,
-                                    float *centroids) {
+                                    int n_colors, int stride, int *centroids) {
   if (n_cache <= 0) return;
   for (int i = 0; i < n_colors * stride; i += stride) {
-    float min_diff = fabsf(centroids[i] - color_cache[0]);
+    int min_diff = abs(centroids[i] - (int)color_cache[0]);
     int idx = 0;
     for (int j = 1; j < n_cache; ++j) {
-      float this_diff = fabsf(centroids[i] - color_cache[j]);
+      const int this_diff = abs(centroids[i] - color_cache[j]);
       if (this_diff < min_diff) {
         min_diff = this_diff;
         idx = j;
       }
     }
-    if (min_diff < 1.5) centroids[i] = color_cache[idx];
+    if (min_diff <= 1) centroids[i] = color_cache[idx];
   }
 }
 
@@ -2818,19 +2817,13 @@
 // of palette mode.
 static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
                          MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int palette_ctx,
-                         int dc_mode_cost, const float *data, float *centroids,
+                         int dc_mode_cost, const int *data, int *centroids,
                          int n, uint16_t *color_cache, int n_cache,
                          MB_MODE_INFO *best_mbmi,
                          uint8_t *best_palette_color_map, int64_t *best_rd,
                          int64_t *best_model_rd, int *rate, int *rate_tokenonly,
                          int *rate_overhead, int64_t *distortion,
                          int *skippable) {
-  aom_clear_system_state();
-#ifndef NDEBUG
-  for (int i = 0; i < n; ++i) {
-    assert(!isnan(centroids[i]));
-  }
-#endif  // NDEBUG
   optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
   int k = av1_remove_duplicates(centroids, n);
   if (k < PALETTE_MIN_SIZE) {
@@ -2847,7 +2840,7 @@
   else
 #endif  // CONFIG_HIGHBITDEPTH
     for (int i = 0; i < k; ++i)
-      pmi->palette_colors[i] = clip_pixel((int)centroids[i]);
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
   pmi->palette_size[0] = k;
   MACROBLOCKD *const xd = &x->e_mbd;
   uint8_t *const color_map = xd->plane[0].color_index_map;
@@ -2928,12 +2921,11 @@
 #endif  // CONFIG_FILTER_INTRA
 
   if (colors > 1 && colors <= 64) {
-    aom_clear_system_state();
     int r, c, i;
     const int max_itr = 50;
-    float *const data = x->palette_buffer->kmeans_data_buf;
-    float centroids[PALETTE_MAX_SIZE];
-    float lb, ub, val;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub, val;
 #if CONFIG_HIGHBITDEPTH
     uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
     if (cpi->common.use_highbitdepth)
@@ -2998,8 +2990,7 @@
     // TODO(huisu@google.com): Try to avoid duplicate computation in cases
     // where the dominant colors and the k-means results are similar.
     for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
-      aom_clear_system_state();
-      for (i = 0; i < n; ++i) centroids[i] = (float)(top_colors[i]);
+      for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
       palette_rd_y(cpi, x, mbmi, bsize, palette_ctx, dc_mode_cost, data,
                    centroids, n, color_cache, n_cache, best_mbmi,
                    best_palette_color_map, best_rd, best_model_rd, rate,
@@ -3008,7 +2999,6 @@
 
     // K-means clustering.
     for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
-      aom_clear_system_state();
       if (colors == PALETTE_MIN_SIZE) {
         // Special case: These colors automatically become the centroids.
         assert(colors == n);
@@ -5174,13 +5164,12 @@
 
   colors = colors_u > colors_v ? colors_u : colors_v;
   if (colors > 1 && colors <= 64) {
-    aom_clear_system_state();
     int r, c, n, i, j;
     const int max_itr = 50;
-    float lb_u, ub_u, val_u;
-    float lb_v, ub_v, val_v;
-    float *const data = x->palette_buffer->kmeans_data_buf;
-    float centroids[2 * PALETTE_MAX_SIZE];
+    int lb_u, ub_u, val_u;
+    int lb_v, ub_v, val_v;
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[2 * PALETTE_MAX_SIZE];
 
 #if CONFIG_HIGHBITDEPTH
     uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
@@ -5230,26 +5219,20 @@
 
     for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
          --n) {
-      aom_clear_system_state();
       for (i = 0; i < n; ++i) {
         centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
         centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
       }
       av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
-#ifndef NDEBUG
-      for (i = 0; i < 2 * n; ++i) {
-        assert(!isnan(centroids[i]));
-      }
-#endif  // NDEBUG
       optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
       // Sort the U channel colors in ascending order.
       for (i = 0; i < 2 * (n - 1); i += 2) {
         int min_idx = i;
-        float min_val = centroids[i];
+        int min_val = centroids[i];
         for (j = i + 2; j < 2 * n; j += 2)
           if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
         if (min_idx != i) {
-          float temp_u = centroids[i], temp_v = centroids[i + 1];
+          int temp_u = centroids[i], temp_v = centroids[i + 1];
           centroids[i] = centroids[min_idx];
           centroids[i + 1] = centroids[min_idx + 1];
           centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
@@ -9031,8 +9014,8 @@
   int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
   const uint8_t *const src_v = x->plane[2].src.buf;
-  float *const data = x->palette_buffer->kmeans_data_buf;
-  float centroids[2 * PALETTE_MAX_SIZE];
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
   uint8_t *const color_map = xd->plane[1].color_index_map;
   int r, c;
 #if CONFIG_HIGHBITDEPTH