Improved encoder threading

Reduce the number of sync points by letting each thread
continue immediately with a new MB row.
Better multicore scaling, improves performance by 5-20% on ARM multicore.

Change-Id: Ic97e4d1c4886a842c85dd3539a93cb217188ed1b
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 8a97e98..580e1ee 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -190,11 +190,8 @@
 typedef struct
 {
     MACROBLOCK  mb;
-    int mb_row;
-    TOKENEXTRA *tp;
     int segment_counts[MAX_MB_SEGMENTS];
     int totalrate;
-    int current_mb_col;
 } MB_ROW_COMP;
 
 typedef struct
@@ -608,7 +605,8 @@
     signed char *cyclic_refresh_map;
 
     // multithread data
-    int current_mb_col_main;
+    int * mt_current_mb_col;
+    int mt_sync_range;
     int processor_core_count;
     int b_multi_threaded;
     int encoding_thread_count;
@@ -621,8 +619,8 @@
 
 #if CONFIG_MULTITHREAD
     //events
-    sem_t *h_event_mbrencoding;
-    sem_t h_event_main;
+    sem_t *h_event_start_encoding;
+    sem_t h_event_end_encoding;
 #endif
 
     TOKENLIST *tplist;