Release previous hash-me memory when encoding new frame

The hash-table from hash-me has two usages:
  - intrablock copy for keyframe
  - prune subpel search in interframe

The current usages only require that the hash-tables for the current and
previously coded frames be kept. However, the codebase currently releases
the memory only when the frame buffers are refreshed.

This commit changes the behavior so that the hash-tables for frames other
than the current and previously coded frames are released, which allows
significant memory savings.

Memory usage when encoding 4k videos over 33 frames:
Baseline: 7.2 GB
 Commit : 3.1 GB
 Saving : 4.1 GB, or ~57% memory reduction

BUG=aomedia:2453

Change-Id: I659dbe5eb2d7badea7239a841fc5ba06987f75f7
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f205562..52c651c 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4562,6 +4562,9 @@
       }
     }
 
+#if CONFIG_DEBUG
+    cm->cur_frame->hash_table.has_content++;
+#endif
     av1_hash_table_create(&cm->cur_frame->hash_table);
     av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
                                       is_block_same[0], &cpi->td.mb);
@@ -4879,22 +4882,20 @@
   cm->current_frame.skip_mode_info.skip_mode_flag =
       check_skip_mode_enabled(cpi);
 
-  {
-    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
-    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
-    cpi->row_mt = 0;
+  cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
+  cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
+  cpi->row_mt = 0;
 
-    if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
-      cpi->row_mt = 1;
-      cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
-      cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
-      av1_encode_tiles_row_mt(cpi);
-    } else {
-      if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
-        av1_encode_tiles_mt(cpi);
-      else
-        encode_tiles(cpi);
-    }
+  if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) {
+    cpi->row_mt = 1;
+    cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read;
+    cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write;
+    av1_encode_tiles_row_mt(cpi);
+  } else {
+    if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
+      av1_encode_tiles_mt(cpi);
+    else
+      encode_tiles(cpi);
   }
 
   // If intrabc is allowed but never selected, reset the allow_intrabc flag.
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 1da9a6a..d5349d6 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -5404,6 +5404,20 @@
   (void)xd;
 }
 
+#if CONFIG_DEBUG
+static int hash_me_has_at_most_two_refs(RefCntBuffer *frame_bufs) {
+  int total_count = 0;
+  for (int frame_idx = 0; frame_idx < FRAME_BUFFERS; ++frame_idx) {
+    if (frame_bufs[frame_idx].hash_table.has_content > 1) {
+      return 0;
+    }
+    total_count += frame_bufs[frame_idx].hash_table.has_content;
+  }
+
+  return total_count <= 2;
+}
+#endif
+
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                                      uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5495,6 +5509,16 @@
     cpi->common.cur_frame_force_integer_mv = 0;
   }
 
+#if CONFIG_DEBUG
+  assert(hash_me_has_at_most_two_refs(cm->buffer_pool->frame_bufs) &&
+         "Hash-me is leaking memory!");
+#endif
+
+  if (cpi->oxcf.pass != 1 && cpi->need_to_clear_prev_hash_table) {
+    av1_hash_table_clear_all(cpi->previous_hash_table);
+    cpi->need_to_clear_prev_hash_table = 0;
+  }
+
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
@@ -5693,9 +5717,12 @@
 
   av1_rc_postencode_update(cpi, *size);
 
-  // Store encoded frame's hash table for is_integer_mv() next time
-  if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+  // Store encoded frame's hash table for is_integer_mv() next time.
+  // Beware! If we don't update previous_hash_table here we will leak the
+  // items stored in cur_frame's hash_table!
+  if (oxcf->pass != 1 && av1_use_hash_me(cm)) {
     cpi->previous_hash_table = &cm->cur_frame->hash_table;
+    cpi->need_to_clear_prev_hash_table = 1;
   }
 
   // Clear the one shot update flags for segmentation map and mode/ref loop
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 9a7e25a..2f9264d 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -798,6 +798,7 @@
   int rate_size;
   int rate_index;
   hash_table *previous_hash_table;
+  int need_to_clear_prev_hash_table;
   int previous_index;
 
   unsigned int row_mt;
diff --git a/av1/encoder/hash_motion.c b/av1/encoder/hash_motion.c
index 950a570..25fedf9 100644
--- a/av1/encoder/hash_motion.c
+++ b/av1/encoder/hash_motion.c
@@ -20,20 +20,6 @@
 static const int crc_bits = 16;
 static const int block_size_bits = 3;
 
-static void hash_table_clear_all(hash_table *p_hash_table) {
-  if (p_hash_table->p_lookup_table == NULL) {
-    return;
-  }
-  int max_addr = 1 << (crc_bits + block_size_bits);
-  for (int i = 0; i < max_addr; i++) {
-    if (p_hash_table->p_lookup_table[i] != NULL) {
-      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
-      aom_free(p_hash_table->p_lookup_table[i]);
-      p_hash_table->p_lookup_table[i] = NULL;
-    }
-  }
-}
-
 // TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
 // If yes, fix this function
 static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
@@ -111,17 +97,40 @@
     x->g_crc_initialized = 1;
   }
   p_hash_table->p_lookup_table = NULL;
+#if CONFIG_DEBUG
+  p_hash_table->has_content = 0;
+#endif
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table == NULL) {
+    return;
+  }
+  int max_addr = 1 << (crc_bits + block_size_bits);
+  for (int i = 0; i < max_addr; i++) {
+    if (p_hash_table->p_lookup_table[i] != NULL) {
+      aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+      aom_free(p_hash_table->p_lookup_table[i]);
+      p_hash_table->p_lookup_table[i] = NULL;
+    }
+  }
+#if CONFIG_DEBUG
+  p_hash_table->has_content = 0;
+#endif
 }
 
 void av1_hash_table_destroy(hash_table *p_hash_table) {
-  hash_table_clear_all(p_hash_table);
+  av1_hash_table_clear_all(p_hash_table);
   aom_free(p_hash_table->p_lookup_table);
   p_hash_table->p_lookup_table = NULL;
+#if CONFIG_DEBUG
+  p_hash_table->has_content = 0;
+#endif
 }
 
 void av1_hash_table_create(hash_table *p_hash_table) {
   if (p_hash_table->p_lookup_table != NULL) {
-    hash_table_clear_all(p_hash_table);
+    av1_hash_table_clear_all(p_hash_table);
     return;
   }
   const int max_addr = 1 << (crc_bits + block_size_bits);
@@ -129,6 +138,9 @@
       (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
   memset(p_hash_table->p_lookup_table, 0,
          sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
+#if CONFIG_DEBUG
+  p_hash_table->has_content = 0;
+#endif
 }
 
 static void hash_table_add_to_table(hash_table *p_hash_table,
diff --git a/av1/encoder/hash_motion.h b/av1/encoder/hash_motion.h
index ed9bb6e..0ea9daa 100644
--- a/av1/encoder/hash_motion.h
+++ b/av1/encoder/hash_motion.h
@@ -32,9 +32,13 @@
 
 typedef struct _hash_table {
   Vector **p_lookup_table;
+#if CONFIG_DEBUG
+  int has_content;
+#endif
 } hash_table;
 
 void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
 void av1_hash_table_destroy(hash_table *p_hash_table);
 void av1_hash_table_create(hash_table *p_hash_table);
 int32_t av1_hash_table_count(const hash_table *p_hash_table,