Allocate workers for each module independently

This CL allows the number of worker threads for each multi-threaded
module to be computed independently of the other modules.

Change-Id: I758a03d1d21d6b188003ce3c42467d8bfc48ba54
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index d3d7a04..02afee5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -2277,10 +2277,15 @@
       num_workers = av1_fp_compute_num_enc_workers(cpi);
 #endif
     } else {
-      num_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      av1_compute_num_workers_for_mt(cpi);
+      num_workers = av1_get_max_num_workers(cpi);
     }
-    if ((num_workers > 1) && (cpi->mt_info.num_workers == 0))
+    if ((num_workers > 1) && (cpi->mt_info.num_workers == 0)) {
       av1_create_workers(cpi, num_workers);
+      if (cpi->oxcf.pass != 1) {
+        av1_create_second_pass_workers(cpi, num_workers);
+      }
+    }
 
     // Call for LAP stage
     if (cpi_lap != NULL) {
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 44cefbb..4e7de26 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1544,7 +1544,8 @@
     av1_loop_filter_dealloc(&mt_info->lf_row_sync);
     av1_cdef_mt_dealloc(&mt_info->cdef_sync);
 #if !CONFIG_REALTIME_ONLY
-    av1_loop_restoration_dealloc(&mt_info->lr_row_sync, mt_info->num_workers);
+    av1_loop_restoration_dealloc(&mt_info->lr_row_sync,
+                                 mt_info->num_mod_workers[MOD_LR]);
     av1_gm_dealloc(&mt_info->gm_sync);
     av1_tf_mt_dealloc(&mt_info->tf_sync);
 #endif
@@ -2067,7 +2068,7 @@
 #endif
   if (use_restoration) {
     MultiThreadInfo *const mt_info = &cpi->mt_info;
-    const int num_workers = mt_info->num_workers;
+    const int num_workers = mt_info->num_mod_workers[MOD_LR];
     av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
     av1_pick_filter_restoration(cpi->source, cpi);
     if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
@@ -2099,7 +2100,7 @@
  */
 static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
-  const int num_workers = mt_info->num_workers;
+  const int num_workers = mt_info->num_mod_workers[MOD_LPF];
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
 
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 95efd65..f7d9d6b 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -157,6 +157,18 @@
   COST_UPD_OFF,
 } COST_UPDATE_TYPE;
 
+typedef enum {
+  MOD_FP,           // First pass
+  MOD_TF,           // Temporal filtering
+  MOD_TPL,          // TPL
+  MOD_GME,          // Global motion estimation
+  MOD_ENC,          // Encode stage
+  MOD_LPF,          // Deblocking loop filter
+  MOD_CDEF_SEARCH,  // CDEF search
+  MOD_LR,           // Loop restoration filtering
+  NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
 /*!\endcond */
 
 /*!
@@ -1345,14 +1357,21 @@
   int num_workers;
 
   /*!
-   * Number of workers created for tpl and tile/row multi-threading of encoder.
+   * Number of workers used for different MT modules.
    */
-  int num_enc_workers;
+  int num_mod_workers[NUM_MT_MODULES];
 
   /*!
-   * Number of workers created for first-pass multi-threading.
+   * Flag indicating whether the thread-specific buffers for tile/row based
+   * multi-threading of the first pass stage have already been allocated.
    */
-  int num_fp_workers;
+  int fp_mt_buf_init_done;
+
+  /*!
+   * Flag indicating whether the thread-specific buffers for tile/row based
+   * multi-threading of the encode stage have already been allocated.
+   */
+  int enc_mt_buf_init_done;
 
   /*!
    * Synchronization object used to launch job in the worker thread.
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 43776f9..b71a40a 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -528,7 +528,7 @@
   return 1;
 }
 
-static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
   AV1_COMMON *const cm = &cpi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   MultiThreadInfo *const mt_info = &cpi->mt_info;
@@ -568,14 +568,37 @@
     AVxWorker *const worker = &mt_info->workers[i];
     EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
 
-    ++mt_info->num_enc_workers;
-
     thread_data->cpi = cpi;
     thread_data->thread_id = i;
     // Set the starting tile for each thread.
     thread_data->start = i;
 
     if (i > 0) {
+      // alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
+
+      // Create threads
+      if (!winterface->reset(worker))
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->td = &cpi->td;
+    }
+    winterface->sync(worker);
+  }
+}
+
+static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  assert(mt_info->workers != NULL);
+  assert(mt_info->tile_thr_data != NULL);
+
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+    if (i > 0) {
       // Set up sms_tree.
       av1_setup_sms_tree(cpi, thread_data->td);
 
@@ -622,21 +645,13 @@
             cm, thread_data->td->vt64x64,
             aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
       }
-
-      // Create threads
-      if (!winterface->reset(worker))
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                           "Tile encoder thread creation failed");
-    } else {
-      // Main thread acts as a worker and uses the thread data in cpi.
-      thread_data->td = &cpi->td;
     }
     if (cpi->oxcf.row_mt == 1)
       CHECK_MEM_ERROR(
           cm, thread_data->td->tctx,
           (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
-    winterface->sync(worker);
   }
+  mt_info->enc_mt_buf_init_done = 1;
 }
 
 void av1_create_workers(AV1_COMP *cpi, int num_workers) {
@@ -675,6 +690,10 @@
   AV1_COMMON *const cm = &cpi->common;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
   MultiThreadInfo *const mt_info = &cpi->mt_info;
+  // For single-pass encode, threads are already created during call to
+  // av1_create_second_pass_workers(). Create threads only in the case of
+  // pass = 1.
+  const int create_workers = (mt_info->num_mod_workers[MOD_FP] == 0) ? 1 : 0;
 
   assert(mt_info->workers != NULL);
   assert(mt_info->tile_thr_data != NULL);
@@ -692,8 +711,6 @@
     AVxWorker *const worker = &mt_info->workers[i];
     EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
 
-    ++mt_info->num_fp_workers;
-
     thread_data->cpi = cpi;
     thread_data->thread_id = i;
     // Set the starting tile for each thread.
@@ -704,16 +721,22 @@
       thread_data->td->firstpass_ctx =
           av1_alloc_pmc(cm, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
 
-      // Create threads
-      if (!winterface->reset(worker))
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                           "Tile encoder thread creation failed");
+      if (create_workers) {
+        // Create threads
+        if (!winterface->reset(worker))
+          aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                             "Tile encoder thread creation failed");
+      }
     } else {
       // Main thread acts as a worker and uses the thread data in cpi.
       thread_data->td = &cpi->td;
     }
-    winterface->sync(worker);
+    if (create_workers) {
+      winterface->sync(worker);
+      ++mt_info->num_mod_workers[MOD_FP];
+    }
   }
+  mt_info->fp_mt_buf_init_done = 1;
 }
 #endif
 
@@ -878,8 +901,17 @@
   return AOMMIN(max_threads, tile_cols * tile_rows);
 }
 
+// Returns the largest per-module worker count, capped at oxcf.max_threads.
+int av1_get_max_num_workers(AV1_COMP *cpi) {
+  int max_num_workers = 0;
+  for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+    max_num_workers = AOMMAX(cpi->mt_info.num_mod_workers[i], max_num_workers);
+  assert(max_num_workers >= 1);
+  return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
+}
+
 // Computes the number of workers for encoding stage (row/tile multi-threading)
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+static AOM_INLINE int compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
   if (max_workers <= 1) return 1;
   if (cpi->oxcf.row_mt)
     return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
@@ -892,7 +924,7 @@
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
-  int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+  int num_workers = mt_info->num_mod_workers[MOD_ENC];
 
   assert(IMPLIES(cpi->tile_data == NULL,
                  cpi->allocated_tiles < tile_cols * tile_rows));
@@ -900,10 +932,10 @@
 
   av1_init_tile_data(cpi);
   // Only run once to create threads and allocate thread data.
-  if (mt_info->num_enc_workers == 0) {
+  if (mt_info->enc_mt_buf_init_done == 0) {
     create_enc_workers(cpi, num_workers);
   } else {
-    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+    num_workers = AOMMIN(num_workers, mt_info->num_workers);
   }
   prepare_enc_workers(cpi, enc_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
@@ -996,13 +1028,7 @@
   const int tile_rows = cm->tiles.rows;
   int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
   int max_sb_rows = 0, max_sb_cols = 0;
-
-  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
-  // post-processing stages in encoder is quiet low, so limiting the number of
-  // threads to the theoretical limit in row-mt does not have much impact on
-  // post-processing multi-threading stage. Need to revisit this when
-  // post-processing time starts shooting up.
-  int num_workers = av1_compute_num_enc_workers(cpi, mt_info->num_workers);
+  int num_workers = mt_info->num_mod_workers[MOD_ENC];
 
   assert(IMPLIES(cpi->tile_data == NULL,
                  cpi->allocated_tiles < tile_cols * tile_rows));
@@ -1047,10 +1073,10 @@
   }
 
   // Only run once to create threads and allocate thread data.
-  if (mt_info->num_enc_workers == 0) {
+  if (mt_info->enc_mt_buf_init_done == 0) {
     create_enc_workers(cpi, num_workers);
   } else {
-    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+    num_workers = AOMMIN(num_workers, mt_info->num_workers);
   }
   assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
                         num_workers);
@@ -1084,12 +1110,12 @@
   const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
   max_mb_rows = fp_compute_max_mb_rows(cm, cpi->tile_data, fp_block_size);
 
-  // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of
-  // post-processing stages in encoder is quiet low, so limiting the number of
-  // threads to the theoretical limit in row-mt does not have much impact on
-  // post-processing multi-threading stage. Need to revisit this when
-  // post-processing time starts shooting up.
-  num_workers = av1_fp_compute_num_enc_workers(cpi);
+  // For pass = 1, compute the number of workers needed. For single-pass encode
+  // (pass = 0), the number of workers has already been computed.
+  if (mt_info->num_mod_workers[MOD_FP] == 0)
+    num_workers = av1_fp_compute_num_enc_workers(cpi);
+  else
+    num_workers = mt_info->num_mod_workers[MOD_FP];
 
   if (enc_row_mt->allocated_tile_cols != tile_cols ||
       enc_row_mt->allocated_tile_rows != tile_rows ||
@@ -1117,7 +1143,8 @@
 
   num_workers = AOMMIN(num_workers, mt_info->num_workers);
   // Only run once to create threads and allocate thread data.
-  if (mt_info->num_fp_workers == 0) fp_create_enc_workers(cpi, num_workers);
+  if (mt_info->fp_mt_buf_init_done == 0)
+    fp_create_enc_workers(cpi, num_workers);
   assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
                         num_workers);
   fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
@@ -1291,16 +1318,13 @@
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
-      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+      // OBMC buffers are used only to init MS params and remain unused when
+      // called from tpl, hence set the buffers to defaults.
+      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
     }
   }
 }
 
-// Computes num_workers for tpl multi-threading.
-static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
-  return av1_compute_num_enc_workers(cpi, cpi->mt_info.num_workers);
-}
-
 // Implements multi-threading for tpl.
 void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
@@ -1309,12 +1333,8 @@
   TplParams *tpl_data = &cpi->tpl_data;
   AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
   int mb_rows = mi_params->mb_rows;
-  int num_workers = compute_num_tpl_workers(cpi);
-
-  if (mt_info->num_enc_workers == 0)
-    create_enc_workers(cpi, num_workers);
-  else
-    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+  int num_workers =
+      AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
 
   if (mb_rows != tpl_sync->rows) {
     av1_tpl_dealloc(tpl_sync);
@@ -1412,7 +1432,9 @@
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
       thread_data->td->mb = cpi->td.mb;
-      thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+      // OBMC buffers are used only to init MS params and remain unused when
+      // called from tf, hence set the buffers to defaults.
+      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
       tf_alloc_and_reset_data(&thread_data->td->tf_data, cpi->tf_ctx.num_pels,
                               is_highbitdepth);
     }
@@ -1451,11 +1473,8 @@
   MultiThreadInfo *mt_info = &cpi->mt_info;
   const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
 
-  int num_workers = mt_info->num_workers;
-  if (mt_info->num_enc_workers == 0)
-    create_enc_workers(cpi, num_workers);
-  else
-    num_workers = AOMMIN(num_workers, mt_info->num_enc_workers);
+  int num_workers =
+      AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
 
   prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
   launch_workers(mt_info, num_workers);
@@ -1610,12 +1629,12 @@
 static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
   int total_refs =
       cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
-  int max_num_workers = cpi->mt_info.num_workers;
-  int max_allowed_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
-                                ? AOMMIN(MAX_DIRECTIONS, max_num_workers)
-                                : max_num_workers;
-
-  return (AOMMIN(total_refs, max_allowed_workers));
+  int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+                           ? AOMMIN(MAX_DIRECTIONS, total_refs)
+                           : total_refs;
+  num_gm_workers = AOMMIN(num_gm_workers, cpi->oxcf.max_threads);
+  assert(num_gm_workers <= cpi->mt_info.num_workers);
+  return (num_gm_workers);
 }
 
 // Frees the memory allocated for each worker in global motion multi-threading.
@@ -1783,7 +1802,7 @@
 void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
                                 CdefSearchCtx *cdef_search_ctx) {
   AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
-  const int num_workers = mt_info->num_workers;
+  const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
 
   cdef_reset_job_info(cdef_sync);
   prepare_cdef_workers(mt_info, cdef_search_ctx, cdef_filter_block_worker_hook,
@@ -1791,3 +1810,59 @@
   launch_workers(mt_info, num_workers);
   sync_enc_workers(mt_info, cm, num_workers);
 }
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(AV1_COMP *cpi) {
+  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop filter multi-threading.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for cdef multi-threading.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
+  int num_mod_workers = 0;
+  switch (mod_name) {
+    case MOD_FP:
+      if (cpi->oxcf.pass == 2)
+        num_mod_workers = 0;
+      else
+        num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      break;
+    case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
+    case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
+    case MOD_GME: num_mod_workers = 1; break;
+    case MOD_ENC:
+      num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      break;
+    case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
+    case MOD_CDEF_SEARCH:
+      num_mod_workers = compute_num_cdef_workers(cpi);
+      break;
+    case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+    default: assert(0); break;
+  }
+  return (num_mod_workers);
+}
+// Computes the number of workers for each MT module in the encoder.
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+  for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+    cpi->mt_info.num_mod_workers[i] =
+        compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
+}
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index e8b8ad3..55e7f7b 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -74,10 +74,14 @@
 
 void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
 
-int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(AV1_COMP *cpi);
 
 void av1_create_workers(AV1_COMP *cpi, int num_workers);
 
+void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers);
+
 void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
                                 CdefSearchCtx *cdef_search_ctx);
 
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 745f75b..225263c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -78,6 +78,13 @@
   }
 }
 
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+  obmc_buffer->wsrc = NULL;
+  obmc_buffer->mask = NULL;
+  obmc_buffer->above_pred = NULL;
+  obmc_buffer->left_pred = NULL;
+}
+
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index 8f6085f..94baed6 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -203,6 +203,8 @@
   aom_sad_multi_d_fn_t sdx4df;
 } FULLPEL_MOTION_SEARCH_PARAMS;
 
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
 void av1_make_default_fullpel_ms_params(
     FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
     const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index e6446c7..9b3924f 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -50,7 +50,7 @@
                                 AV1_COMP *const cpi, int filt_level,
                                 int partial_frame, int plane, int dir) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
-  int num_workers = mt_info->num_workers;
+  int num_workers = mt_info->num_mod_workers[MOD_LPF];
   AV1_COMMON *const cm = &cpi->common;
   int64_t filt_err;