Add av1_down_sample_scan_count

This reduces memory usage for adapt_scan.
The whole change is guarded by the flag USE_2X2_PROB.

Change-Id: If7839d6396dad7618155ef2f36896d17743696ce
diff --git a/av1/common/scan.c b/av1/common/scan.c
index fc8562e..97491e8 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -6604,6 +6604,39 @@
   return value < low ? low : (value > high ? high : (int)value);
 }
 
+#if USE_2X2_PROB
+// Tells update_scan_prob() whether the non-zero counts of tx_size should be
+// merged into 2x2 cells before the probabilities are adapted.
+// Returns 1 when either side of the transform is longer than 8, else 0.
+static int do_down_sample(TX_SIZE tx_size) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int is_large = width > 8 || height > 8;
+  return is_large;
+}
+
+// Sums each 2x2 cell of the row-major tx_w x tx_h array non_zero_count into
+// one entry of non_zero_count_ds, a row-major (tx_w / 2) x (tx_h / 2) array.
+void av1_down_sample_scan_count(uint32_t *non_zero_count_ds,
+                                const uint32_t *non_zero_count,
+                                TX_SIZE tx_size) {
+  const int tx_w = tx_size_wide[tx_size];
+  const int tx_h = tx_size_high[tx_size];
+  const int tx_w_ds = tx_w >> 1;
+  for (int r = 0; r < tx_h; r += 2) {
+    for (int c = 0; c < tx_w; c += 2) {
+      // The cell reads rows r..r+1 and columns c..c+1; both must be in range.
+      assert(r + 1 < tx_h && c + 1 < tx_w);
+      const int ci = r * tx_w + c;
+      const int ci_ds = (r >> 1) * tx_w_ds + (c >> 1);
+      non_zero_count_ds[ci_ds] = non_zero_count[ci] + non_zero_count[ci + 1] +
+                                 non_zero_count[ci + tx_w] +
+                                 non_zero_count[ci + 1 + tx_w];
+    }
+  }
+}
+#endif
+
 static void update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
                              int rate) {
   FRAME_CONTEXT *pre_fc = cm->pre_fc;
@@ -6612,12 +6645,29 @@
   uint32_t *non_zero_count = get_non_zero_counts(&cm->counts, tx_size, tx_type);
   const int tx2d_size = tx_size_2d[tx_size];
   unsigned int block_num = cm->counts.txb_count[tx_size][tx_type];
+  uint32_t *non_zero_count_new = non_zero_count;
+  int count_size = tx2d_size;
+#if USE_2X2_PROB
+#if CONFIG_TX64X64
+  DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[1024]);
+  assert((tx2d_size >> 2) <= 1024);
+#else   // CONFIG_TX64X64
+  DECLARE_ALIGNED(16, uint32_t, non_zero_count_ds[256]);
+  assert((tx2d_size >> 2) <= 256);
+#endif  // CONFIG_TX64X64
+  if (do_down_sample(tx_size)) {
+    av1_down_sample_scan_count(non_zero_count_ds, non_zero_count, tx_size);
+    non_zero_count_new = non_zero_count_ds;
+    count_size = tx2d_size >> 2;
+    block_num <<= 2;
+  }
+#endif
   int i;
-  for (i = 0; i < tx2d_size; i++) {
+  for (i = 0; i < count_size; i++) {
     int64_t curr_prob =
         block_num == 0
             ? 0
-            : (non_zero_count[i] << ADAPT_SCAN_PROB_PRECISION) / block_num;
+            : (non_zero_count_new[i] << ADAPT_SCAN_PROB_PRECISION) / block_num;
     int64_t prev_prob = prev_non_zero_prob[i];
     int64_t pred_prob =
         (curr_prob * rate +
diff --git a/av1/common/scan.h b/av1/common/scan.h
index c9911de..5246f1f 100644
--- a/av1/common/scan.h
+++ b/av1/common/scan.h
@@ -30,6 +30,7 @@
 extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 #if CONFIG_ADAPT_SCAN
+#define USE_2X2_PROB 0
 void av1_update_scan_count_facade(AV1_COMMON *cm, FRAME_COUNTS *counts,
                                   TX_SIZE tx_size, TX_TYPE tx_type,
                                   const tran_low_t *dqcoeffs, int max_scan);
@@ -55,7 +56,12 @@
                           const int16_t *iscan, int16_t *neighbors);
 void av1_init_scan_order(AV1_COMMON *cm);
 void av1_adapt_scan_order(AV1_COMMON *cm);
-#endif
+#if USE_2X2_PROB
+void av1_down_sample_scan_count(uint32_t *non_zero_count_ds,
+                                const uint32_t *non_zero_count,
+                                TX_SIZE tx_size);  // sums 2x2 cells of counts
+#endif  // USE_2X2_PROB
+#endif  // CONFIG_ADAPT_SCAN
 void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
 
 static INLINE int get_coef_context(const int16_t *neighbors,
diff --git a/test/scan_test.cc b/test/scan_test.cc
index 16c831c..a2ca724 100644
--- a/test/scan_test.cc
+++ b/test/scan_test.cc
@@ -94,4 +94,17 @@
   }
 }
 
+#if USE_2X2_PROB
+// Feeds a 4x4 count array through av1_down_sample_scan_count and checks that
+// each of the four outputs equals the sum of the matching 2x2 input cell.
+TEST(ScanTest, av1_down_sample_scan_count) {
+  const uint32_t counts[16] = { 13, 12, 11, 10, 13, 9, 10, 8,
+                                11, 12, 9,  8,  13, 9, 9,  10 };
+  const uint32_t expected[4] = { 47, 39, 45, 36 };
+  uint32_t actual[4];
+  av1_down_sample_scan_count(actual, counts, TX_4X4);
+  for (int idx = 0; idx != 4; ++idx) EXPECT_EQ(expected[idx], actual[idx]);
+}
+#endif
+
 }  // namespace