Allintra: fix an issue in multithreading for deltaq-mode=3

A data race issue was seen due to "av1_frame_init_quantizer" is
called inside each worker thread.

The quantizer initialization could be moved before worker threads
start to run.

BUG=aomedia:3376

Change-Id: Ic901a5eed7197c9fd0157a31400ff80ead2e5658
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index 0463856..c561031 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c
@@ -229,7 +229,8 @@
   return (rate_cost << AV1_PROB_COST_SHIFT);
 }
 
-void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, const int mi_row,
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+                                MACROBLOCKD *xd, const int mi_row,
                                 int16_t *src_diff, tran_low_t *coeff,
                                 tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                 double *sum_rec_distortion,
@@ -237,9 +238,6 @@
   AV1_COMMON *const cm = &cpi->common;
   uint8_t *buffer = cpi->source->y_buffer;
   int buf_stride = cpi->source->y_stride;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO mbmi;
   memset(&mbmi, 0, sizeof(mbmi));
   MB_MODE_INFO *mbmi_ptr = &mbmi;
@@ -250,8 +248,6 @@
   const int coeff_count = block_size * block_size;
   const int mb_step = mi_size_wide[bsize];
   const BitDepthInfo bd_info = get_bit_depth_info(xd);
-  cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
-  av1_frame_init_quantizer(cpi);
   const AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
   // We allocate cpi->tile_data (of size 1) when we call this function in
   // multithreaded mode, so cpi->tile_data may be a null pointer when we call
@@ -412,10 +408,14 @@
       ++mt_unit_col;
     }
   }
+  // Set the pointer to null since mbmi is only allocated inside this function.
+  xd->mi = NULL;
 }
 
 static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
                                double *sum_est_rate) {
+  MACROBLOCK *x = &cpi->td.mb;
+  MACROBLOCKD *xd = &x->e_mbd;
   const BLOCK_SIZE bsize = cpi->weber_bsize;
   const int mb_step = mi_size_wide[bsize];
   DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
@@ -423,8 +423,8 @@
   DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
   DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
   for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
-    av1_calc_mb_wiener_var_row(cpi, mi_row, src_diff, coeff, qcoeff, dqcoeff,
-                               sum_rec_distortion, sum_est_rate);
+    av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
+                               dqcoeff, sum_rec_distortion, sum_est_rate);
   }
 }
 
@@ -538,6 +538,17 @@
     aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   cpi->norm_wiener_variance = 0;
+
+  MACROBLOCK *x = &cpi->td.mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  // xd->mi needs to be setup since it is used in av1_frame_init_quantizer.
+  MB_MODE_INFO mbmi;
+  memset(&mbmi, 0, sizeof(mbmi));
+  MB_MODE_INFO *mbmi_ptr = &mbmi;
+  xd->mi = &mbmi_ptr;
+  cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+  av1_frame_init_quantizer(cpi);
+
   double sum_rec_distortion = 0.0;
   double sum_est_rate = 0.0;
 
@@ -602,6 +613,8 @@
     cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
   }
 
+  // Set the pointer to null since mbmi is only allocated inside this function.
+  xd->mi = NULL;
   aom_free_frame_buffer(&cm->cur_frame->buf);
 }
 
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h
index e783d22..9e10566 100644
--- a/av1/encoder/allintra_vis.h
+++ b/av1/encoder/allintra_vis.h
@@ -22,7 +22,8 @@
 
 void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
 
-void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, const int mi_row,
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+                                MACROBLOCKD *xd, const int mi_row,
                                 int16_t *src_diff, tran_low_t *coeff,
                                 tran_low_t *qcoeff, tran_low_t *dqcoeff,
                                 double *sum_rec_distortion,
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index f94e675..1c8631a 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -2417,7 +2417,6 @@
 #endif  // !CONFIG_REALTIME_ONLY
 
 // Allocate memory for row synchronization
-// TODO(chengchen): do we need dealloc? where?
 static void wiener_var_sync_mem_alloc(
     AV1EncRowMultiThreadSync *const row_mt_sync, AV1_COMMON *const cm,
     const int rows) {
@@ -2449,6 +2448,34 @@
   row_mt_sync->sync_range = 1;
 }
 
+// Deallocate row based multi-threading synchronization related mutex and data
+static void wiener_var_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+  if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (row_mt_sync->mutex_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+      }
+      aom_free(row_mt_sync->mutex_);
+    }
+    if (row_mt_sync->cond_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_cond_destroy(&row_mt_sync->cond_[i]);
+      }
+      aom_free(row_mt_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    aom_free(row_mt_sync->num_finished_cols);
+
+    // clear the structure as the source of this call may be dynamic change
+    // in tiles in which case this call will be followed by an _alloc()
+    // which may fail.
+    av1_zero(*row_mt_sync);
+  }
+}
+
 static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
                                                   AVxWorkerHook hook,
                                                   const int num_workers) {
@@ -2462,12 +2489,20 @@
     worker->data2 = NULL;
 
     thread_data->thread_id = i;
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+    // Set the starting tile for each thread, in this case the preprocessing
+    // stage does not need tiles. So we set it to 0.
+    thread_data->start = 0;
 
     thread_data->cpi = cpi;
-    thread_data->td = &cpi->td;
-    thread_data->td->mb = cpi->td.mb;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
+    }
   }
 }
 
@@ -2475,6 +2510,8 @@
   (void)unused;
   EncWorkerData *const thread_data = (EncWorkerData *)arg1;
   AV1_COMP *const cpi = thread_data->cpi;
+  MACROBLOCK *x = &thread_data->td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
   const BLOCK_SIZE bsize = cpi->weber_bsize;
   const int mb_step = mi_size_wide[bsize];
   AV1EncRowMultiThreadSync *const row_mt_sync = &cpi->tile_data[0].row_mt_sync;
@@ -2501,8 +2538,9 @@
 #endif
     if (!has_jobs) break;
     // TODO(chengchen): properly accumulate the distortion and rate.
-    av1_calc_mb_wiener_var_row(cpi, current_mi_row, src_diff, coeff, qcoeff,
-                               dqcoeff, &sum_rec_distortion, &sum_est_rate);
+    av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+                               qcoeff, dqcoeff, &sum_rec_distortion,
+                               &sum_est_rate);
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
@@ -2550,6 +2588,8 @@
   prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
   launch_workers(mt_info, num_workers);
   sync_enc_workers(mt_info, cm, num_workers);
+
+  wiener_var_sync_mem_dealloc(row_mt_sync);
 }
 
 // Compare and order tiles based on absolute sum of tx coeffs.