Optimize Memory for firstpass multi-threading

Avoid allocating redundant memory for the worker thread data in
single-pass encoding: the LAP_STAGE compressor now reuses the workers
and tile_thr_data owned by the ENCODE_STAGE compressor, and only the
ENCODE_STAGE frees them.

Change-Id: I1a29e3c4e98df085a2b6b4cb03835eead2823c7d
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 0ef0fd8..8a22da6 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -2193,8 +2193,10 @@
       aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
       int64_t dst_time_stamp_la = dst_time_stamp;
       int64_t dst_end_time_stamp_la = dst_end_time_stamp;
-      if (cpi_lap->mt_info.workers == NULL)
+      if (cpi_lap->mt_info.workers == NULL) {
         cpi_lap->mt_info.workers = cpi->mt_info.workers;
+        cpi_lap->mt_info.tile_thr_data = cpi->mt_info.tile_thr_data;
+      }
       cpi_lap->mt_info.num_workers = cpi->mt_info.num_workers;
       status = av1_get_compressed_data(
           cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 24a2dbf..23410a7 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3525,7 +3525,6 @@
 static AOM_INLINE void free_thread_data(AV1_COMP *cpi) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   AV1_COMMON *cm = &cpi->common;
-  if (mt_info->tile_thr_data == NULL) return;
   for (int t = 0; t < mt_info->num_workers; ++t) {
     EncWorkerData *const thread_data = &mt_info->tile_thr_data[t];
     aom_free(thread_data->td->tctx);
@@ -3668,8 +3667,10 @@
     aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
   }
 
-  if (cpi->compressor_stage != LAP_STAGE) terminate_worker_data(cpi);
-  free_thread_data(cpi);
+  if (cpi->compressor_stage != LAP_STAGE) {
+    terminate_worker_data(cpi);
+    free_thread_data(cpi);
+  }
 
 #if CONFIG_MULTITHREAD
   if (enc_row_mt_mutex_ != NULL) {
@@ -3682,8 +3683,10 @@
   }
 #endif
   av1_row_mt_mem_dealloc(cpi);
-  aom_free(mt_info->tile_thr_data);
-  if (cpi->compressor_stage != LAP_STAGE) aom_free(mt_info->workers);
+  if (cpi->compressor_stage != LAP_STAGE) {
+    aom_free(mt_info->tile_thr_data);
+    aom_free(mt_info->workers);
+  }
 
 #if !CONFIG_REALTIME_ONLY
   av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index e65c07c..5a3fdef 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -512,8 +512,8 @@
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   int sb_mi_size = av1_get_sb_mi_size(cm);
 
-  CHECK_MEM_ERROR(cm, mt_info->tile_thr_data,
-                  aom_calloc(num_workers, sizeof(*mt_info->tile_thr_data)));
+  assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
 
 #if CONFIG_MULTITHREAD
   if (cpi->oxcf.row_mt == 1) {
@@ -542,14 +542,8 @@
     thread_data->thread_id = i;
 
     if (i > 0) {
-      // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
-                      aom_memalign(32, sizeof(*thread_data->td)));
-      av1_zero(*thread_data->td);
-
       // Set up sms_tree.
       av1_setup_sms_tree(cpi, thread_data->td);
-      av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
 
       av1_alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
 
@@ -619,14 +613,29 @@
   AV1_COMMON *const cm = &cpi->common;
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  mt_info->tile_thr_data = NULL;
 
   CHECK_MEM_ERROR(cm, mt_info->workers,
                   aom_malloc(num_workers * sizeof(*mt_info->workers)));
+
+  CHECK_MEM_ERROR(cm, mt_info->tile_thr_data,
+                  aom_calloc(num_workers, sizeof(*mt_info->tile_thr_data)));
+
   for (int i = num_workers - 1; i >= 0; i--) {
     AVxWorker *const worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
     winterface->init(worker);
     worker->thread_name = "aom enc worker";
+
+    if (i > 0) {
+      // Allocate thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td,
+                      aom_memalign(32, sizeof(*thread_data->td)));
+      av1_zero(*thread_data->td);
+
+      // Set up shared coeff buffers.
+      av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
+    }
     ++mt_info->num_workers;
   }
 }
@@ -637,10 +646,8 @@
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   MultiThreadInfo *const mt_info = &cpi->mt_info;
 
-  CHECK_MEM_ERROR(cm, mt_info->tile_thr_data,
-                  aom_calloc(num_workers, sizeof(*mt_info->tile_thr_data)));
-
   assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
 
 #if CONFIG_MULTITHREAD
   AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
@@ -661,14 +668,6 @@
     thread_data->thread_id = i;
 
     if (i > 0) {
-      // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
-                      aom_memalign(32, sizeof(*thread_data->td)));
-      av1_zero(*thread_data->td);
-
-      // Set up shared coeff buffers.
-      av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
-
       // Set up firstpass PICK_MODE_CONTEXT.
       thread_data->td->firstpass_ctx =
           av1_alloc_pmc(cm, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
@@ -753,6 +752,11 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
 
@@ -809,6 +813,11 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
@@ -1244,6 +1253,11 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    }
+
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
@@ -1416,6 +1430,8 @@
     worker->hook = hook;
     worker->data1 = thread_data;
     worker->data2 = NULL;
+
+    thread_data->cpi = cpi;
   }
 }