Implement multithreading for CDEF search

This CL adds support for block-based multithreading of the CDEF
search module.

cpu-used   Resolution    Tile    Average Encode Time
                                     Reduction (%)
   2        832x480      2x1       0.39 (2 threads)
   3       1280x720      2x2       0.52 (4 threads)
   4       1920x1080     4x2       0.51 (8 threads)
   5       3840x2160     4x2       2.57 (8 threads)
   6       3840x2160     4x2       1.55 (8 threads)

Change-Id: I754d3e7f0cb5355726682163bcf99bfde43ba233
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index cc681bc..9dade55 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1541,6 +1541,7 @@
 #endif
   if (mt_info->num_workers > 1) {
     av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+    av1_cdef_mt_dealloc(&mt_info->cdef_sync);
 #if !CONFIG_REALTIME_ONLY
     av1_loop_restoration_dealloc(&mt_info->lr_row_sync, mt_info->num_workers);
     av1_gm_dealloc(&mt_info->gm_sync);
@@ -2041,7 +2042,7 @@
     start_timing(cpi, cdef_time);
 #endif
     // Find CDEF parameters
-    av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd,
+    av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
                     cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult);
 
     // Apply the filter
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index ed79ba0..b4bc58d 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -40,6 +40,7 @@
 #include "av1/encoder/level.h"
 #include "av1/encoder/lookahead.h"
 #include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/speed_features.h"
@@ -1337,7 +1338,7 @@
 /*!
  * \brief Encoder parameters related to multi-threading.
  */
-typedef struct {
+typedef struct MultiThreadInfo {
   /*!
    * Number of workers created for multi-threading.
    */
@@ -1399,6 +1400,11 @@
    * Temporal Filter multi-threading object.
    */
   AV1TemporalFilterSync tf_sync;
+
+  /*!
+   * CDEF search multi-threading object.
+   */
+  AV1CdefSync cdef_sync;
 } MultiThreadInfo;
 
 /*!\cond */
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index bf22743..730eedb 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -545,6 +545,12 @@
     CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_)));
     if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
   }
+  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+  if (cdef_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
+    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+  }
 #endif
 
   for (int i = num_workers - 1; i >= 0; i--) {
@@ -1662,3 +1668,110 @@
   sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
 }
 #endif  // !CONFIG_REALTIME_ONLY
+
+// Destroys and frees the CDEF search job mutex, leaving mutex_ reset to NULL.
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
+  (void)cdef_sync;
+  assert(cdef_sync != NULL);
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ == NULL) return;
+  pthread_mutex_destroy(cdef_sync->mutex_);
+  aom_free(cdef_sync->mutex_);
+  cdef_sync->mutex_ = NULL;  // Creation is gated on mutex_ == NULL.
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Advances the (fbr, fbc) job cursor by one 64x64 block in raster order and
+// sets end_of_frame once the cursor moves past the last block row (nvfb).
+static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
+  cdef_sync->fbc++;
+  if (cdef_sync->fbc == nhfb) {  // Moved past the last column of this row.
+    cdef_sync->fbr++;
+    if (cdef_sync->fbr == nvfb) {  // Moved past the last row as well.
+      cdef_sync->end_of_frame = 1;
+    } else {
+      cdef_sync->fbc = 0;  // Wrap to the first column of the next row.
+    }
+  }
+}
+
+// Rewinds the job cursor to block (0, 0) and clears the end_of_frame flag.
+static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+  cdef_sync->end_of_frame = 0;
+  cdef_sync->fbr = 0;
+  cdef_sync->fbc = 0;
+}
+
+// Claims the next non-skipped block (mutex-guarded if CONFIG_MULTITHREAD).
+// Returns 1 and fills cur_fbr, cur_fbc and sb_count if one exists, else 0.
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+                                        CdefSearchCtx *cdef_search_ctx,
+                                        int *cur_fbr, int *cur_fbc,
+                                        int *sb_count) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  int do_next_block = 0;
+  const int nvfb = cdef_search_ctx->nvfb;
+  const int nhfb = cdef_search_ctx->nhfb;
+
+  // Skipped blocks are never dispatched: advance the cursor past them until
+  // a block that needs processing is found or the end of frame is reached.
+  while ((!cdef_sync->end_of_frame) &&
+         (cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+                       cdef_sync->fbc))) {
+    update_next_job_info(cdef_sync, nvfb, nhfb);
+  }
+
+  // Hand the cursor position and current sb_count to the caller, then bump
+  // the shared sb_count and advance the cursor for the next request.
+  if (cdef_sync->end_of_frame == 0) {
+    do_next_block = 1;
+    *cur_fbr = cdef_sync->fbr;
+    *cur_fbc = cdef_sync->fbc;
+    *sb_count = cdef_search_ctx->sb_count;
+    cdef_search_ctx->sb_count++;
+    update_next_job_info(cdef_sync, nvfb, nhfb);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  return do_next_block;
+}
+
+// Worker hook: runs av1_cdef_mse_calc_block() on each job until none remain.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+  CdefSearchCtx *cdef_search_ctx = (CdefSearchCtx *)arg2;
+  int cur_fbr, cur_fbc, sb_count;  // Written by cdef_get_next_job() on success.
+  while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+                           &sb_count)) {
+    av1_cdef_mse_calc_block(cdef_search_ctx, cur_fbr, cur_fbc, sb_count);
+  }
+  return 1;  // NOTE(review): presumably "no error" for AVxWorker; confirm.
+}
+
+// Sets each worker's hook, shared cdef_sync (data1) and search ctx (data2).
+static void prepare_cdef_workers(MultiThreadInfo *mt_info,
+                                 CdefSearchCtx *cdef_search_ctx,
+                                 AVxWorkerHook hook, int num_workers) {
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    worker->hook = hook;
+    worker->data1 = &mt_info->cdef_sync;  // Same sync object for all workers.
+    worker->data2 = cdef_search_ctx;  // Same context shared by all workers.
+  }
+}
+
+// Computes the CDEF search MSE for the whole frame using all mt_info workers.
+void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
+                                CdefSearchCtx *cdef_search_ctx) {
+  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+  const int num_workers = mt_info->num_workers;
+
+  cdef_reset_job_info(cdef_sync);  // Start dispatching from block (0, 0).
+  prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
+                       num_workers);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, cm, num_workers);
+}
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index ab8e1bb..e8b8ad3 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -78,6 +78,11 @@
 
 void av1_create_workers(AV1_COMP *cpi, int num_workers);
 
+void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
+                                CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index c71ef31..55e466d 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -20,6 +20,7 @@
 #include "av1/common/av1_common_int.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/pickcdef.h"
 
 // Get primary and secondary filter strength for the given strength index and
@@ -289,11 +290,10 @@
 //   fbc: Column index in units of 64x64 block
 // Returns:
 //   Nothing will be returned. Contents of cdef_search_ctx will be modified.
-static void cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr,
-                                int fbc) {
-  const CommonModeInfoParams *const mi_params = &cdef_search_ctx->cm->mi_params;
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
+                             int sb_count) {
+  const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
   const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
-  const int sb_count = cdef_search_ctx->sb_count;
   const int coeff_shift = cdef_search_ctx->coeff_shift;
   const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
   const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
@@ -387,14 +387,14 @@
 // Returns:
 //   Nothing will be returned. Contents of cdef_search_ctx will be modified.
 static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx) {
-  const CommonModeInfoParams *const mi_params = &cdef_search_ctx->cm->mi_params;
   // Loop over each sb.
   for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
     for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
       // Checks if cdef processing can be skipped for particular sb.
-      if (cdef_sb_skip(mi_params, fbr, fbc)) continue;
+      if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
       // Calculate mse for each sb and store the relevant sb index.
-      cdef_mse_calc_block(cdef_search_ctx, fbr, fbc);
+      av1_cdef_mse_calc_block(cdef_search_ctx, fbr, fbc,
+                              cdef_search_ctx->sb_count);
       cdef_search_ctx->sb_count++;
     }
   }
@@ -448,7 +448,7 @@
                                         CDEF_PICK_METHOD pick_method) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int num_planes = av1_num_planes(cm);
-  cdef_search_ctx->cm = cm;
+  cdef_search_ctx->mi_params = &cm->mi_params;
   cdef_search_ctx->ref = ref;
   cdef_search_ctx->nvfb =
       (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
@@ -549,7 +549,7 @@
   }
 }
 
-void av1_cdef_search(const YV12_BUFFER_CONFIG *frame,
+void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
                      const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
                      MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method,
                      int rdmult) {
@@ -568,7 +568,11 @@
   // Allocate CDEF search context buffers.
   cdef_alloc_data(&cdef_search_ctx);
   // Frame level mse calculation.
-  cdef_mse_calc_frame(&cdef_search_ctx);
+  if (mt_info->num_workers > 1) {
+    av1_cdef_mse_calc_frame_mt(cm, mt_info, &cdef_search_ctx);
+  } else {
+    cdef_mse_calc_frame(&cdef_search_ctx);
+  }
 
   /* Search for different number of signaling bits. */
   int nb_strength_bits = 0;
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index ef342dc..7fe1edb 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -19,6 +19,8 @@
 #endif
 
 /*!\cond */
+struct MultiThreadInfo;
+
 #define REDUCED_PRI_STRENGTHS_LVL1 8
 #define REDUCED_PRI_STRENGTHS_LVL2 5
 #define REDUCED_SEC_STRENGTHS_LVL3 2
@@ -56,6 +58,20 @@
                                         BLOCK_SIZE bsize, int coeff_shift,
                                         int row, int col);
 
+// Shared job-dispatch state for multi-threaded CDEF search.
+typedef struct AV1CdefSyncData {
+#if CONFIG_MULTITHREAD
+  // Mutex held while a job is handed out (guards the cursor fields below).
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Set to 1 once the job cursor has moved past the last block of the frame
+  int end_of_frame;
+  // Row index of the next block to dispatch, in units of 64x64 blocks
+  int fbr;
+  // Column index of the next block to dispatch, in units of 64x64 blocks
+  int fbc;
+} AV1CdefSync;
+
 /*! \brief CDEF search context.
  */
 typedef struct {
@@ -64,9 +80,9 @@
    */
   const YV12_BUFFER_CONFIG *ref;
   /*!
-   * Pointer to top level common structure
+   * Pointer to params related to MB_MODE_INFO arrays and related info
    */
-  AV1_COMMON *cm;
+  CommonModeInfoParams *mi_params;
   /*!
    * Info specific to each plane
    */
@@ -190,6 +206,9 @@
     return 1;
   return 0;
 }
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx, int fbr, int fbc,
+                             int sb_count);
 /*!\endcond */
 
 /*!\brief AV1 CDEF parameter search
@@ -198,6 +217,7 @@
  *
  * Searches for optimal CDEF parameters for frame
  *
+ * \param[in]      mt_info      Pointer to multi-threading parameters
  * \param[in]      frame        Compressed frame buffer
  * \param[in]      ref          Source frame buffer
  * \param[in,out]  cm           Pointer to top level common structure
@@ -216,7 +236,8 @@
  * \arg \c damping_factor: CDEF damping factor.
  *
  */
-void av1_cdef_search(const YV12_BUFFER_CONFIG *frame,
+void av1_cdef_search(struct MultiThreadInfo *mt_info,
+                     const YV12_BUFFER_CONFIG *frame,
                      const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
                      MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult);