Feature: reduce line buffers using block summary stats instead of pixels
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index aa37ea0..6fded67 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -390,6 +390,10 @@
   uint16_t *frame;
   /** mask frame buffer */
   uint16_t *mask;
+  /** buffer with block pixel values */
+  uint16_t *block_values;
+  /** buffer with # of occurrences of pixel values in a block */
+  uint16_t *block_hist;
   /** stride from frame, mask and c_values buffers */
   int stride;
   /** frame height for 'frame', mask and c_values buffers */
diff --git a/av1/common/debanding.c b/av1/common/debanding.c
index f5e60e5..cb758c0 100644
--- a/av1/common/debanding.c
+++ b/av1/common/debanding.c
@@ -226,9 +226,11 @@
     dbi->buffers.c_values_histograms =
         aom_malloc(frame_width * dbi->num_bins * sizeof(uint16_t));
   } else {
-    int mask_width = frame_width >> CAMDA_LOG2_BLOCK_SIZE;
-    int mask_height = frame_height >> CAMDA_LOG2_BLOCK_SIZE;
-    dbi->mask = aom_malloc(mask_width * mask_height * sizeof(*dbi->mask));
+    int mask_size = (frame_width>>CAMDA_LOG2_BLOCK_SIZE) *
+                    (frame_height>>CAMDA_LOG2_BLOCK_SIZE) * sizeof(uint16_t);
+    dbi->mask = aom_malloc(mask_size);
+    dbi->block_values = aom_malloc(mask_size * (CAMDA_BLOCK_MAX_COUNT+1));
+    dbi->block_hist = aom_malloc(mask_size * CAMDA_BLOCK_MAX_COUNT);
   }
 
   return use_deband;
@@ -271,6 +273,35 @@
   return (filter_size * filter_size + 3 * (ceil_log2(shifted_wh) - 11) - 1)>>1;
 }
 
+
+// Gathers the distinct pixel values of block (b_row, b_col), with their counts, into
+// entry mask_pos of dbi->block_values / dbi->block_hist. The entry must hold UINT16_MAX
+// (values) and 0 (hist) on entry. Returns the number of distinct values, or 0 as soon
+// as one more than CAMDA_BLOCK_MAX_COUNT is found (the entry is then left partly filled).
+uint16_t pixel_count_if_below_thr(DebandInfo *dbi, uint16_t *image, ptrdiff_t stride,
+                                  int mask_pos, int b_row, int b_col) {
+  // ptrdiff_t, not long: long is 32 bits on LLP64 targets and would narrow row * stride.
+  const ptrdiff_t row = (ptrdiff_t)b_row << CAMDA_LOG2_BLOCK_SIZE;
+  const ptrdiff_t col = (ptrdiff_t)b_col << CAMDA_LOG2_BLOCK_SIZE;
+  ptrdiff_t index = row * stride + col;
+  uint16_t *values = dbi->block_values + (ptrdiff_t)mask_pos * (CAMDA_BLOCK_MAX_COUNT + 1);
+  uint16_t *hist = dbi->block_hist + (ptrdiff_t)mask_pos * CAMDA_BLOCK_MAX_COUNT;
+  uint16_t num_vals = 0;  // highest slot index used so far
+  for (int i = 0; i < CAMDA_BLOCK_SIZE; i++, index += stride) {
+    for (int j = 0; j < CAMDA_BLOCK_SIZE; j++) {
+      const uint16_t pixel = image[index + j];
+      uint16_t slot = 0;  // scan to this value's slot, or to the first unused one
+      while (values[slot] != pixel && values[slot] != UINT16_MAX) slot++;
+      // values[CAMDA_BLOCK_MAX_COUNT] is never written, so it always ends the scan.
+      if (slot == CAMDA_BLOCK_MAX_COUNT) return 0;
+      if (slot > num_vals) num_vals = slot;
+      values[slot] = pixel;
+      hist[slot]++;
+    }
+  }
+  return num_vals + 1;
+}
+
 void camda_get_spatial_mask(DebandInfo *dbi, int width, int height) {
   uint16_t pad_size = CAMDA_MASK_FILTER_SIZE >> 1;
   uint16_t *image_data = dbi->frame;
@@ -284,7 +315,11 @@
   int dp_width = width + 2 * pad_size + 1;
   int dp_height = 2 * pad_size + 2;
   memset(dp, 0, dp_width * dp_height * sizeof(uint32_t));
-  memset(mask_data, 0, mask_stride * mask_height * sizeof(uint16_t));
+
+  int mask_size = mask_height * mask_stride * sizeof(uint16_t);
+  memset(mask_data, 0, mask_size);
+  memset(dbi->block_values, UINT16_MAX, mask_size * (CAMDA_BLOCK_MAX_COUNT+1));
+  memset(dbi->block_hist, 0, mask_size * CAMDA_BLOCK_MAX_COUNT);
 
   // Initial computation: fill dp except for the last row
   for (int i = 0; i < pad_size; i++) {
@@ -343,40 +378,36 @@
               + dp[top * dp_width + left];
 
           int mask_j = left >> CAMDA_LOG2_BLOCK_SIZE;
-          mask_data[mask_i * mask_stride + mask_j] = (result > mask_index);
+          if (result > mask_index) {
+            int mask_pos = mask_i * mask_stride + mask_j;
+            mask_data[mask_pos] = pixel_count_if_below_thr(dbi, image_data, stride,
+                                                           mask_pos, mask_i, mask_j);
+          }
         }
       }
     }
   }
 }
 
-static inline void add_block_to_histogram(uint16_t *histogram, int b_row, int b_col,
-                                          uint16_t *image, ptrdiff_t stride,
-                                          const uint16_t num_diffs) {
+static inline void add_hist_to_histogram(DebandInfo *dbi, uint16_t *histogram,
+                                         ptrdiff_t mask_pos, const uint16_t num_diffs,
+                                         uint16_t num_values) {  // adds the first num_values (value, count) pairs of block mask_pos
   uint16_t *hist_diff = histogram + num_diffs;
-  const int row = b_row << CAMDA_LOG2_BLOCK_SIZE;
-  const int col = b_col << CAMDA_LOG2_BLOCK_SIZE;
-  long int index = row * stride + col;
-  for (int i=0; i<CAMDA_BLOCK_SIZE; i++, index+=stride) {
-    hist_diff[image[index]]++;
-    hist_diff[image[index+1]]++;
-    hist_diff[image[index+2]]++;
-    hist_diff[image[index+3]]++;
+  const uint16_t *counts = dbi->block_hist + mask_pos * CAMDA_BLOCK_MAX_COUNT;  // this block's counts
+  const uint16_t *pixels = dbi->block_values + mask_pos * (CAMDA_BLOCK_MAX_COUNT + 1);  // this block's values
+  for (int k = 0; k < num_values; k++) {  // the bin of each stored value grows by its count
+    hist_diff[pixels[k]] += counts[k];
   }
 }
 
-static inline void sub_block_to_histogram(uint16_t *histogram, int b_row, int b_col,
-                                          uint16_t *image, ptrdiff_t stride,
-                                          const uint16_t num_diffs) {
+static inline void sub_hist_to_histogram(DebandInfo *dbi, uint16_t *histogram,
+                                         ptrdiff_t mask_pos, const uint16_t num_diffs,
+                                         uint16_t num_values) {  // removes the first num_values (value, count) pairs of block mask_pos
   uint16_t *hist_diff = histogram + num_diffs;
-  const int row = b_row << CAMDA_LOG2_BLOCK_SIZE;
-  const int col = b_col << CAMDA_LOG2_BLOCK_SIZE;
-  long int index = row * stride + col;
-  for (int i=0; i<CAMDA_BLOCK_SIZE; i++, index+=stride) {
-    hist_diff[image[index]]--;
-    hist_diff[image[index+1]]--;
-    hist_diff[image[index+2]]--;
-    hist_diff[image[index+3]]--;
+  const uint16_t *counts = dbi->block_hist + mask_pos * CAMDA_BLOCK_MAX_COUNT;  // this block's counts
+  const uint16_t *pixels = dbi->block_values + mask_pos * (CAMDA_BLOCK_MAX_COUNT + 1);  // this block's values
+  for (int k = 0; k < num_values; k++) {  // the bin of each stored value shrinks by its count
+    hist_diff[pixels[k]] -= counts[k];
   }
 }
 
@@ -469,9 +500,11 @@
 
     for (int b_i=-b_pad_size; b_i<=b_pad_size; b_i++)
       for (int b_j=0; b_j<=b_pad_size; b_j++)
-        if (b_row+b_i>=0 && b_row+b_i<b_height)
-          if (mask[(b_row+b_i) * mask_stride + b_j])
-            add_block_to_histogram(hist, b_row + b_i, b_j, image, stride, num_diffs);
+        if (b_row+b_i>=0 && b_row+b_i<b_height) {
+          ptrdiff_t mask_pos = (b_row+b_i) * mask_stride + b_j;
+          if (mask[mask_pos])
+            add_hist_to_histogram(dbi, hist, mask_pos, num_diffs, mask[mask_pos]);
+        }
 
     if (mask[b_row * mask_stride])
       dither_block(hist, image, b_row, 0, stride, num_diffs, tvi_for_diff, dbi);
@@ -483,15 +516,17 @@
           int b_col_in = b_col + b_pad_size;
           int b_col_out = b_col - b_pad_size - 1;
 
-          if (b_col_out >= 0)
-            if (mask[b_row_curr * mask_stride + b_col_out])
-              sub_block_to_histogram(hist, b_row_curr, b_col_out,
-                                     image, stride, num_diffs);
+          if (b_col_out >= 0) {
+            ptrdiff_t mask_pos = b_row_curr * mask_stride + b_col_out;
+            if (mask[mask_pos])
+              sub_hist_to_histogram(dbi, hist, mask_pos, num_diffs, mask[mask_pos]);
+          }
 
-          if (b_col_in<b_width)
-            if (mask[b_row_curr * mask_stride + b_col_in])
-              add_block_to_histogram(hist, b_row_curr, b_col_in,
-                                     image, stride, num_diffs);
+          if (b_col_in<b_width) {
+            ptrdiff_t mask_pos = b_row_curr * mask_stride + b_col_in;
+            if (mask[mask_pos])
+              add_hist_to_histogram(dbi, hist, mask_pos, num_diffs, mask[mask_pos]);
+          }
         }
       }
 
@@ -540,5 +575,8 @@
     aom_free(dbi->diffs_weights);
     aom_free(dbi->buffers.c_values);
     aom_free(dbi->buffers.c_values_histograms);
+  } else {
+    aom_free(dbi->block_values);
+    aom_free(dbi->block_hist);
   }
 }
diff --git a/av1/common/debanding.h b/av1/common/debanding.h
index d320e0f..12ed775 100644
--- a/av1/common/debanding.h
+++ b/av1/common/debanding.h
@@ -31,6 +31,7 @@
 #define CAMDA_MASK_FILTER_SIZE (7)
 #define CAMDA_BLOCK_SIZE (4)
 #define CAMDA_LOG2_BLOCK_SIZE (2)
+#define CAMDA_BLOCK_MAX_COUNT (4)
 
 /* Visibility threshold for luminance ΔL < tvi_threshold*L_mean for BT.1886 */
 #define CAMBI_TVI (0.019)