diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 11dd01b8f821..caec85beee12 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -75,8 +75,9 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - /* prepare workqueues for other notifiers */ - CPU_PRI_WORKQUEUE = 5, + /* bring up workqueues before normal notifiers and down after */ + CPU_PRI_WORKQUEUE_UP = 5, + CPU_PRI_WORKQUEUE_DOWN = -5, }; #define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d36619ead3ba..00e46376e28f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -157,6 +157,24 @@ struct lockdep_map { #endif }; +static inline void lockdep_copy_map(struct lockdep_map *to, + struct lockdep_map *from) +{ + int i; + + *to = *from; + /* + * Since the class cache can be modified concurrently we could observe + * half pointers (64bit arch using 32bit copy insns). Therefore clear + * the caches and take the performance hit. + * + * XXX it doesn't work well with lockdep_set_class_and_subclass(), since + * that relies on cache abuse. + */ + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + to->class_cache[i] = NULL; +} + /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index af155450cabb..25253f8a643e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -16,6 +16,7 @@ struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); +void delayed_work_timer_fn(unsigned long __data); /* * The first word is the work queue pointer and the flags rolled into @@ -67,9 +68,18 @@ enum { WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, + /* data contains off-queue information when !WORK_STRUCT_CWQ */ + WORK_OFFQ_FLAG_BASE = WORK_STRUCT_FLAG_BITS, + + WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), + + WORK_OFFQ_FLAG_BITS = 1, + WORK_OFFQ_CPU_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, + + /* convenience constants */ WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, - WORK_STRUCT_NO_CPU = WORK_CPU_NONE << WORK_STRUCT_FLAG_BITS, + WORK_STRUCT_NO_CPU = (unsigned long)WORK_CPU_NONE << WORK_OFFQ_CPU_SHIFT, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, @@ -92,6 +102,7 @@ struct work_struct { struct delayed_work { struct work_struct work; struct timer_list timer; + int cpu; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) @@ -115,41 +126,43 @@ struct execute_work { #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif -#define __WORK_INITIALIZER(n, f) { \ - .data = WORK_DATA_STATIC_INIT(), \ - .entry = { &(n).entry, &(n).entry }, \ - .func = (f), \ - __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ +#define __WORK_INITIALIZER(n, f) { \ + .data = WORK_DATA_STATIC_INIT(), \ + .entry = { &(n).entry, &(n).entry }, \ + .func = (f), \ + __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } -#define __DELAYED_WORK_INITIALIZER(n, f) { \ - .work = __WORK_INITIALIZER((n).work, (f)), \ - .timer = TIMER_INITIALIZER(NULL, 0, 0), \ +#define __DELAYED_WORK_INITIALIZER(n, f) { \ + .work = __WORK_INITIALIZER((n).work, (f)), \ + .timer = TIMER_INITIALIZER(delayed_work_timer_fn, \ + 0, (unsigned long)&(n)), \ } -#define __DEFERRED_WORK_INITIALIZER(n, f) { \ - .work = __WORK_INITIALIZER((n).work, (f)), \ - .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \ +#define __DEFERRED_WORK_INITIALIZER(n, f) { \ + .work = __WORK_INITIALIZER((n).work, (f)), \ + .timer = TIMER_DEFERRED_INITIALIZER(delayed_work_timer_fn, \ + 0, (unsigned long)&(n)), \ } -#define DECLARE_WORK(n, f) \ +#define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) -#define DECLARE_DELAYED_WORK(n, f) \ +#define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f) -#define DECLARE_DEFERRED_WORK(n, f) \ +#define DECLARE_DEFERRED_WORK(n, f) \ struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f) /* * initialize a work item's function pointer */ -#define PREPARE_WORK(_work, _func) \ - do { \ - (_work)->func = (_func); \ +#define PREPARE_WORK(_work, _func) \ + do { \ + (_work)->func = (_func); \ } while (0) -#define PREPARE_DELAYED_WORK(_work, _func) \ +#define PREPARE_DELAYED_WORK(_work, _func) \ PREPARE_WORK(&(_work)->work, (_func)) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -179,7 +192,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0);\ + lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ PREPARE_WORK((_work), (_func)); \ } while (0) @@ -193,32 +206,38 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } } while (0) #endif -#define INIT_WORK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 0); \ +#define INIT_WORK(_work, _func) \ + do { \ + __INIT_WORK((_work), (_func), 0); \ } while (0) -#define INIT_WORK_ONSTACK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 1); \ +#define INIT_WORK_ONSTACK(_work, _func) \ + do { \ + __INIT_WORK((_work), (_func), 1); \ } while (0) -#define INIT_DELAYED_WORK(_work, _func) \ - do { \ - INIT_WORK(&(_work)->work, (_func)); \ - init_timer(&(_work)->timer); \ +#define INIT_DELAYED_WORK(_work, _func) \ + do { \ + INIT_WORK(&(_work)->work, (_func)); \ + init_timer(&(_work)->timer); \ + (_work)->timer.function = delayed_work_timer_fn; \ + (_work)->timer.data = (unsigned long)(_work); \ } while (0) -#define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ - do { \ - INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ - init_timer_on_stack(&(_work)->timer); \ +#define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ + do { \ + INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ + init_timer_on_stack(&(_work)->timer); \ + (_work)->timer.function = delayed_work_timer_fn; \ + (_work)->timer.data = (unsigned long)(_work); \ } while (0) -#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ - do { \ - INIT_WORK(&(_work)->work, (_func)); \ - init_timer_deferrable(&(_work)->timer); \ +#define INIT_DELAYED_WORK_DEFERRABLE(_work, _func) \ + do { \ + INIT_WORK(&(_work)->work, (_func)); \ + init_timer_deferrable(&(_work)->timer); \ + (_work)->timer.function = delayed_work_timer_fn; \ + (_work)->timer.data = (unsigned long)(_work); \ } while (0) /** @@ -278,10 +297,6 @@ enum { * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * - * system_nrt_wq is non-reentrant and guarantees that any given work - * item is never executed in parallel by multiple CPUs. Queue - * flushing might take relatively long. - * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and @@ -289,16 +304,25 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. - * - * system_nrt_freezable_wq is equivalent to system_nrt_wq except that - * it's freezable. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; -extern struct workqueue_struct *system_nrt_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; -extern struct workqueue_struct *system_nrt_freezable_wq; + +static inline struct workqueue_struct *__system_nrt_wq(void) +{ + return system_wq; +} + +static inline struct workqueue_struct *__system_nrt_freezable_wq(void) +{ + return system_freezable_wq; +} + +/* equivlalent to system_wq and system_freezable_wq, deprecated */ +#define system_nrt_wq __system_nrt_wq() +#define system_nrt_freezable_wq __system_nrt_freezable_wq() extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, @@ -321,22 +345,22 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(fmt, flags, max_active, args...) \ -({ \ - static struct lock_class_key __key; \ - const char *__lock_name; \ - \ - if (__builtin_constant_p(fmt)) \ - __lock_name = (fmt); \ - else \ - __lock_name = #fmt; \ - \ - __alloc_workqueue_key((fmt), (flags), (max_active), \ - &__key, __lock_name, ##args); \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ +({ \ + static struct lock_class_key __key; \ + const char *__lock_name; \ + \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ + else \ + __lock_name = #fmt; \ + \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(fmt, flags, max_active, args...) \ - __alloc_workqueue_key((fmt), (flags), (max_active), \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ NULL, NULL, ##args) #endif @@ -353,46 +377,50 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -#define alloc_ordered_workqueue(fmt, flags, args...) \ +#define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) -#define create_workqueue(name) \ +#define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) -#define create_freezable_workqueue(name) \ +#define create_freezable_workqueue(name) \ alloc_workqueue((name), WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, 1) -#define create_singlethread_workqueue(name) \ +#define create_singlethread_workqueue(name) \ alloc_workqueue((name), WQ_UNBOUND | WQ_MEM_RECLAIM, 1) extern void destroy_workqueue(struct workqueue_struct *wq); -extern int queue_work(struct workqueue_struct *wq, struct work_struct *work); -extern int queue_work_on(int cpu, struct workqueue_struct *wq, +extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -extern int queue_delayed_work(struct workqueue_struct *wq, +extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); +extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, +extern bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); +extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay); +extern bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); -extern int schedule_work(struct work_struct *work); -extern int schedule_work_on(int cpu, struct work_struct *work); -extern int schedule_delayed_work(struct delayed_work *work, unsigned long delay); -extern int schedule_delayed_work_on(int cpu, struct delayed_work *work, - unsigned long delay); +extern bool schedule_work_on(int cpu, struct work_struct *work); +extern bool schedule_work(struct work_struct *work); +extern bool schedule_delayed_work_on(int cpu, struct delayed_work *work, + unsigned long delay); +extern bool schedule_delayed_work(struct delayed_work *work, + unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); extern int keventd_up(void); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); -extern bool flush_work_sync(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); -extern bool flush_delayed_work_sync(struct delayed_work *work); +extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, @@ -401,22 +429,6 @@ extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); extern unsigned int work_cpu(struct work_struct *work); extern unsigned int work_busy(struct work_struct *work); -/* - * Kill off a pending schedule_delayed_work(). Note that the work callback - * function may still be running on return from cancel_delayed_work(), unless - * it returns 1 and the work doesn't re-arm itself. Run flush_workqueue() or - * cancel_work_sync() to wait on it. - */ -static inline bool cancel_delayed_work(struct delayed_work *work) -{ - bool ret; - - ret = del_timer_sync(&work->timer); - if (ret) - work_clear_pending(&work->work); - return ret; -} - /* * Like above, but uses del_timer() instead of del_timer_sync(). This means, * if it returns 0 the timer function may be running and the queueing is in @@ -432,6 +444,18 @@ static inline bool __cancel_delayed_work(struct delayed_work *work) return ret; } +/* used to be different but now identical to flush_work(), deprecated */ +static inline bool flush_work_sync(struct work_struct *work) +{ + return flush_work(work); +} + +/* used to be different but now identical to flush_delayed_work(), deprecated */ +static inline bool flush_delayed_work_sync(struct delayed_work *dwork) +{ + return flush_delayed_work(dwork); +} + #ifndef CONFIG_SMP static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { diff --git a/kernel/timer.c b/kernel/timer.c index a85102527467..7acab7de9536 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1109,7 +1109,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = timer->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90fd57da8ea9..e6ad770e09a1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -46,33 +46,39 @@ #include "workqueue_sched.h" enum { - /* global_cwq flags */ + /* + * global_cwq flags + * + * A bound gcwq is either associated or disassociated with its CPU. + * While associated (!DISASSOCIATED), all workers are bound to the + * CPU and none has %WORKER_UNBOUND set and concurrency management + * is in effect. + * + * While DISASSOCIATED, the cpu may be offline and all workers have + * %WORKER_UNBOUND set and concurrency management disabled, and may + * be executing on any CPU. The gcwq behaves as an unbound one. + * + * Note that DISASSOCIATED can be flipped only while holding + * assoc_mutex of all pools on the gcwq to avoid changing binding + * state while create_worker() is in progress. + */ GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ GCWQ_FREEZING = 1 << 1, /* freeze in progress */ /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ WORKER_PREP = 1 << 3, /* preparing to run works */ - WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ - WORKER_REBIND = 1 << 5, /* mom is home, come back */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | - WORKER_CPU_INTENSIVE | WORKER_UNBOUND, - - /* gcwq->trustee_state */ - TRUSTEE_START = 0, /* start */ - TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ - TRUSTEE_BUTCHER = 2, /* butcher workers */ - TRUSTEE_RELEASE = 3, /* release workers */ - TRUSTEE_DONE = 4, /* trustee is done */ + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | + WORKER_CPU_INTENSIVE, NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ @@ -88,7 +94,6 @@ enum { (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ - TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ /* * Rescue workers are used only on emergencies and shared by @@ -142,7 +147,9 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - struct work_struct rebind_work; /* L: rebind worker to cpu */ + + /* for rebinding worker to CPU */ + struct work_struct rebind_work; /* L: for busy worker */ }; struct worker_pool { @@ -151,14 +158,16 @@ struct worker_pool { struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ + + /* nr_idle includes the ones off idle_list for rebinding */ int nr_idle; /* L: currently idle ones */ struct list_head idle_list; /* X: list of idle workers */ struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ + struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ - struct worker *first_idle; /* L: first idle worker */ }; /* @@ -175,11 +184,8 @@ struct global_cwq { struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; /* L: hash of busy workers */ - struct worker_pool pools[2]; /* normal and highpri pools */ - - struct task_struct *trustee; /* L: for gcwq shutdown */ - unsigned int trustee_state; /* L: trustee state */ - wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker_pool pools[NR_WORKER_POOLS]; + /* normal and highpri pools */ } ____cacheline_aligned_in_smp; /* @@ -262,17 +268,15 @@ struct workqueue_struct { }; struct workqueue_struct *system_wq __read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); +struct workqueue_struct *system_highpri_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_highpri_wq); +struct workqueue_struct *system_long_wq __read_mostly; EXPORT_SYMBOL_GPL(system_long_wq); -EXPORT_SYMBOL_GPL(system_nrt_wq); +struct workqueue_struct *system_unbound_wq __read_mostly; EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -527,18 +531,24 @@ static int work_next_color(int color) } /* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue. Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. + * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data + * contain the pointer to the queued cwq. Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and CPU number. + * + * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, cpu or clear + * work->data. These functions should only be called while the work is + * owned - ie. while the PENDING bit is set. * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data. These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. + * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to + * a work. gcwq is available once the work has been queued anywhere after + * initialization until it is sync canceled. cwq is available only while + * the work item is queued. * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work. gcwq is available once the work has been - * queued anywhere after initialization. cwq is available only from - * queueing until execution starts. + * %WORK_OFFQ_CANCELING is used to mark a work item which is being + * canceled. While being canceled, a work item may have its PENDING set + * but stay off timer and worklist for arbitrarily long and nobody should + * try to steal the PENDING bit. */ static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) @@ -555,13 +565,22 @@ static void set_work_cwq(struct work_struct *work, WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); } -static void set_work_cpu(struct work_struct *work, unsigned int cpu) +static void set_work_cpu_and_clear_pending(struct work_struct *work, + unsigned int cpu) { - set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); + /* + * The following wmb is paired with the implied mb in + * test_and_set_bit(PENDING) and ensures all updates to @work made + * here are visible to and precede any updates by the next PENDING + * owner. + */ + smp_wmb(); + set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); } static void clear_work_data(struct work_struct *work) { + smp_wmb(); /* see set_work_cpu_and_clear_pending() */ set_work_data(work, WORK_STRUCT_NO_CPU, 0); } @@ -584,7 +603,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return ((struct cpu_workqueue_struct *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; - cpu = data >> WORK_STRUCT_FLAG_BITS; + cpu = data >> WORK_OFFQ_CPU_SHIFT; if (cpu == WORK_CPU_NONE) return NULL; @@ -592,6 +611,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) return get_gcwq(cpu); } +static void mark_work_canceling(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + + set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, + WORK_STRUCT_PENDING); +} + +static bool work_is_canceling(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +} + /* * Policy functions. These define the policies on how the global worker * pools are managed. Unless noted otherwise, these functions assume that @@ -650,6 +685,13 @@ static bool too_many_workers(struct worker_pool *pool) int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; + /* + * nr_idle and idle_list may disagree if idle rebinding is in + * progress. Never return %true if idle_list is empty. + */ + if (list_empty(&pool->idle_list)) + return false; + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } @@ -698,8 +740,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) { + WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu); atomic_inc(get_pool_nr_running(worker->pool)); + } } /** @@ -735,11 +779,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, * worklist not empty test sequence is in insert_work(). * Please read comment there. * - * NOT_RUNNING is clear. This means that trustee is not in - * charge and we're running on the local cpu w/ rq lock held - * and preemption disabled, which in turn means that none else - * could be manipulating idle_list, so dereferencing idle_list - * without gcwq lock is safe. + * NOT_RUNNING is clear. This means that we're bound to and + * running on the local cpu w/ rq lock held and preemption + * disabled, which in turn means that none else could be + * manipulating idle_list, so dereferencing idle_list without gcwq + * lock is safe. */ if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) to_wakeup = first_worker(pool); @@ -895,6 +939,208 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, work); } +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_delayed_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + + trace_workqueue_activate_work(work); + move_linked_works(work, &cwq->pool->worklist, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + + cwq_activate_delayed_work(work); +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * @delayed: for a delayed work + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * try_to_grab_pending - steal work item from worklist and disable irq + * @work: work item to steal + * @is_dwork: @work is a delayed_work + * @flags: place to store irq state + * + * Try to grab PENDING bit of @work. This function can handle @work in any + * stable state - idle, on timer or on worklist. Return values are + * + * 1 if @work was pending and we successfully stole PENDING + * 0 if @work was idle and we claimed PENDING + * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry + * -ENOENT if someone else is canceling @work, this state may persist + * for arbitrarily long + * + * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting + * preempted while holding PENDING and @work off queue, preemption must be + * disabled on entry. This ensures that we don't return -EAGAIN while + * another task is preempted in this function. + * + * On successful return, >= 0, irq is disabled and the caller is + * responsible for releasing it using local_irq_restore(*@flags). + * + * This function is safe to call from any context other than IRQ handler. + * An IRQ handler may run on top of delayed_work_timer_fn() which can make + * this function return -EAGAIN perpetually. + */ +static int try_to_grab_pending(struct work_struct *work, bool is_dwork, + unsigned long *flags) +{ + struct global_cwq *gcwq; + + local_irq_save(*flags); + + /* try to steal the timer if it exists */ + if (is_dwork) { + struct delayed_work *dwork = to_delayed_work(work); + + if (likely(del_timer(&dwork->timer))) + return 1; + } + + /* try to claim PENDING the normal way */ + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) + return 0; + + /* + * The queueing is in progress, or it is already queued. Try to + * steal it from ->worklist without clearing WORK_STRUCT_PENDING. + */ + gcwq = get_work_gcwq(work); + if (!gcwq) + goto fail; + + spin_lock(&gcwq->lock); + if (!list_empty(&work->entry)) { + /* + * This work is queued, but perhaps we locked the wrong gcwq. + * In that case we must see the new value after rmb(), see + * insert_work()->wmb(). + */ + smp_rmb(); + if (gcwq == get_work_gcwq(work)) { + debug_work_deactivate(work); + + /* + * A delayed work item cannot be grabbed directly + * because it might have linked NO_COLOR work items + * which, if left on the delayed_list, will confuse + * cwq->nr_active management later on and cause + * stall. Make sure the work item is activated + * before grabbing. + */ + if (*work_data_bits(work) & WORK_STRUCT_DELAYED) + cwq_activate_delayed_work(work); + + list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); + + spin_unlock(&gcwq->lock); + return 1; + } + } + spin_unlock(&gcwq->lock); +fail: + local_irq_restore(*flags); + if (work_is_canceling(work)) + return -ENOENT; + cpu_relax(); + return -EAGAIN; +} + /** * insert_work - insert a work into gcwq * @cwq: cwq @work belongs to @@ -975,7 +1221,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct cpu_workqueue_struct *cwq; struct list_head *worklist; unsigned int work_flags; - unsigned long flags; + unsigned int req_cpu = cpu; + + /* + * While a work item is PENDING && off queue, a task trying to + * steal the PENDING will busy-loop waiting for it to either get + * queued or lose PENDING. Grabbing PENDING and queueing should + * happen with IRQ disabled. + */ + WARN_ON_ONCE(!irqs_disabled()); debug_work_activate(work); @@ -988,21 +1242,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; - if (unlikely(cpu == WORK_CPU_UNBOUND)) + if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); /* - * It's multi cpu. If @wq is non-reentrant and @work - * was previously on a different cpu, it might still - * be running there, in which case the work needs to - * be queued on that cpu to guarantee non-reentrance. + * It's multi cpu. If @work was previously on a different + * cpu, it might still be running there, in which case the + * work needs to be queued on that cpu to guarantee + * non-reentrancy. */ gcwq = get_gcwq(cpu); - if (wq->flags & WQ_NON_REENTRANT && - (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + last_gcwq = get_work_gcwq(work); + + if (last_gcwq && last_gcwq != gcwq) { struct worker *worker; - spin_lock_irqsave(&last_gcwq->lock, flags); + spin_lock(&last_gcwq->lock); worker = find_worker_executing_work(last_gcwq, work); @@ -1010,21 +1265,25 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, gcwq = last_gcwq; else { /* meh... not running there, queue here */ - spin_unlock_irqrestore(&last_gcwq->lock, flags); - spin_lock_irqsave(&gcwq->lock, flags); + spin_unlock(&last_gcwq->lock); + spin_lock(&gcwq->lock); } - } else - spin_lock_irqsave(&gcwq->lock, flags); + } else { + spin_lock(&gcwq->lock); + } } else { gcwq = get_gcwq(WORK_CPU_UNBOUND); - spin_lock_irqsave(&gcwq->lock, flags); + spin_lock(&gcwq->lock); } /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); - trace_workqueue_queue_work(cpu, cwq, work); + trace_workqueue_queue_work(req_cpu, cwq, work); - BUG_ON(!list_empty(&work->entry)); + if (WARN_ON(!list_empty(&work->entry))) { + spin_unlock(&gcwq->lock); + return; + } cwq->nr_in_flight[cwq->work_color]++; work_flags = work_color_to_flags(cwq->work_color); @@ -1040,29 +1299,8 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, insert_work(cwq, work, worklist, work_flags); - spin_unlock_irqrestore(&gcwq->lock, flags); -} - -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - int ret; - - ret = queue_work_on(get_cpu(), wq, work); - put_cpu(); - - return ret; + spin_unlock(&gcwq->lock); } -EXPORT_SYMBOL_GPL(queue_work); /** * queue_work_on - queue work on specific cpu @@ -1070,105 +1308,213 @@ EXPORT_SYMBOL_GPL(queue_work); * @wq: workqueue to use * @work: work to queue * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to a specific CPU, the caller must ensure it * can't go away. */ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +bool queue_work_on(int cpu, struct workqueue_struct *wq, + struct work_struct *work) { - int ret = 0; + bool ret = false; + unsigned long flags; + + local_irq_save(flags); if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { __queue_work(cpu, wq, work); - ret = 1; + ret = true; } + + local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(queue_work_on); -static void delayed_work_timer_fn(unsigned long __data) -{ - struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - - __queue_work(smp_processor_id(), cwq->wq, &dwork->work); -} - /** - * queue_delayed_work - queue work on a workqueue after delay + * queue_work - queue work on a workqueue * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing + * @work: work to queue + * + * Returns %false if @work was already on a queue, %true otherwise. * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ -int queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +bool queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + return queue_work_on(WORK_CPU_UNBOUND, wq, work); +} +EXPORT_SYMBOL_GPL(queue_work); + +void delayed_work_timer_fn(unsigned long __data) { - if (delay == 0) - return queue_work(wq, &dwork->work); + struct delayed_work *dwork = (struct delayed_work *)__data; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - return queue_delayed_work_on(-1, wq, dwork, delay); + local_irq_disable(); + __queue_work(dwork->cpu, cwq->wq, &dwork->work); + local_irq_enable(); } -EXPORT_SYMBOL_GPL(queue_delayed_work); +EXPORT_SYMBOL_GPL(delayed_work_timer_fn); -/** - * queue_delayed_work_on - queue work on specific CPU after delay - * @cpu: CPU number to execute work on - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) +static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) { - int ret = 0; struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + unsigned int lcpu; - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { - unsigned int lcpu; - - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); - - timer_stats_timer_set_start_info(&dwork->timer); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn || + timer->data != (unsigned long)dwork); + WARN_ON_ONCE(timer_pending(timer)); + WARN_ON_ONCE(!list_empty(&work->entry)); + + /* + * If @delay is 0, queue @dwork->work immediately. This is for + * both optimization and correctness. The earliest @timer can + * expire is on the closest next tick and delayed_work users depend + * on that there's no such delay when @delay is 0. + */ + if (!delay) { + __queue_work(cpu, wq, &dwork->work); + return; + } + + timer_stats_timer_set_start_info(&dwork->timer); + + /* + * This stores cwq for the moment, for the timer_fn. Note that the + * work's gcwq is preserved to allow reentrance detection for + * delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); /* - * This stores cwq for the moment, for the timer_fn. - * Note that the work's gcwq is preserved to allow - * reentrance detection for delayed works. + * If we cannot get the last gcwq from @work directly, + * select the last CPU such that it avoids unnecessarily + * triggering non-reentrancy check in __queue_work(). */ - if (!(wq->flags & WQ_UNBOUND)) { - struct global_cwq *gcwq = get_work_gcwq(work); + lcpu = cpu; + if (gcwq) + lcpu = gcwq->cpu; + if (lcpu == WORK_CPU_UNBOUND) + lcpu = raw_smp_processor_id(); + } else { + lcpu = WORK_CPU_UNBOUND; + } - if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) - lcpu = gcwq->cpu; - else - lcpu = raw_smp_processor_id(); - } else - lcpu = WORK_CPU_UNBOUND; + set_work_cwq(work, get_cwq(lcpu, wq), 0); - set_work_cwq(work, get_cwq(lcpu, wq), 0); + dwork->cpu = cpu; + timer->expires = jiffies + delay; - timer->expires = jiffies + delay; - timer->data = (unsigned long)dwork; - timer->function = delayed_work_timer_fn; + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); +} - if (unlikely(cpu >= 0)) - add_timer_on(timer, cpu); - else - add_timer(timer); - ret = 1; +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns %false if @work was already on a queue, %true otherwise. If + * @delay is zero and @dwork is idle, it will be scheduled for immediate + * execution. + */ +bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + struct work_struct *work = &dwork->work; + bool ret = false; + unsigned long flags; + + /* read the comment in __queue_work() */ + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_delayed_work(cpu, wq, dwork, delay); + ret = true; } + + local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); + +/** + * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, + * modify @dwork's timer so that it expires after @delay. If @delay is + * zero, @work is guaranteed to be scheduled immediately regardless of its + * current state. + * + * Returns %false if @dwork was idle and queued, %true if @dwork was + * pending and its timer was modified. + * + * This function is safe to call from any context other than IRQ handler. + * See try_to_grab_pending() for details. + */ +bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (likely(ret >= 0)) { + __queue_delayed_work(cpu, wq, dwork, delay); + local_irq_restore(flags); + } + + /* -ENOENT from try_to_grab_pending() becomes %true */ + return ret; +} +EXPORT_SYMBOL_GPL(mod_delayed_work_on); + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(mod_delayed_work); + /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -1196,19 +1542,16 @@ static void worker_enter_idle(struct worker *worker) /* idle_list is LIFO */ list_add(&worker->entry, &pool->idle_list); - if (likely(!(worker->flags & WORKER_ROGUE))) { - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, - jiffies + IDLE_WORKER_TIMEOUT); - } else - wake_up_all(&gcwq->trustee_wait); + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because trustee releases gcwq->lock - * between setting %WORKER_ROGUE and zapping nr_running, the - * warning may trigger spuriously. Check iff trustee is idle. + * Sanity check nr_running. Because gcwq_unbind_fn() releases + * gcwq->lock between setting %WORKER_UNBOUND and zapping + * nr_running, the warning may trigger spuriously. Check iff + * unbind is not in progress. */ - WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && pool->nr_workers == pool->nr_idle && atomic_read(get_pool_nr_running(pool))); } @@ -1248,11 +1591,11 @@ static void worker_leave_idle(struct worker *worker) * verbatim as it's best effort and blocking and gcwq may be * [dis]associated in the meantime. * - * This function tries set_cpus_allowed() and locks gcwq and verifies - * the binding against GCWQ_DISASSOCIATED which is set during - * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters - * idle state or fetches works without dropping lock, it can guarantee - * the scheduling requirement described in the first paragraph. + * This function tries set_cpus_allowed() and locks gcwq and verifies the + * binding against %GCWQ_DISASSOCIATED which is set during + * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker + * enters idle state or fetches works without dropping lock, it can + * guarantee the scheduling requirement described in the first paragraph. * * CONTEXT: * Might sleep. Called without any lock but returns with gcwq->lock @@ -1299,22 +1642,117 @@ __acquires(&gcwq->lock) } /* - * Function for worker->rebind_work used to rebind rogue busy workers - * to the associated cpu which is coming back online. This is - * scheduled by cpu up but can race with other cpu hotplug operations - * and may be executed twice without intervening cpu down. + * Rebind an idle @worker to its CPU. worker_thread() will test + * list_empty(@worker->entry) before leaving idle and call this function. + */ +static void idle_worker_rebind(struct worker *worker) +{ + struct global_cwq *gcwq = worker->pool->gcwq; + + /* CPU may go down again inbetween, clear UNBOUND only on success */ + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_UNBOUND); + + /* rebind complete, become available again */ + list_add(&worker->entry, &worker->pool->idle_list); + spin_unlock_irq(&gcwq->lock); +} + +/* + * Function for @worker->rebind.work used to rebind unbound busy workers to + * the associated cpu which is coming back online. This is scheduled by + * cpu up but can race with other cpu hotplug operations and may be + * executed twice without intervening cpu down. */ -static void worker_rebind_fn(struct work_struct *work) +static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&gcwq->lock); } +/** + * rebind_workers - rebind all workers of a gcwq to the associated CPU + * @gcwq: gcwq of interest + * + * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. + * + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. + * + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. + */ +static void rebind_workers(struct global_cwq *gcwq) +{ + struct worker_pool *pool; + struct worker *worker, *n; + struct hlist_node *pos; + int i; + + lockdep_assert_held(&gcwq->lock); + + for_each_worker_pool(pool, gcwq) + lockdep_assert_held(&pool->assoc_mutex); + + /* dequeue and kick idle ones */ + for_each_worker_pool(pool, gcwq) { + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list + * until rebind is complete to avoid receiving + * premature local wake-ups. + */ + list_del_init(&worker->entry); + + /* + * worker_thread() will see the above dequeuing + * and call idle_worker_rebind(). + */ + wake_up_process(worker->task); + } + } + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + + /* + * wq doesn't really matter but let's keep @worker->pool + * and @cwq->pool consistent for sanity. + */ + if (worker_pool_pri(worker->pool)) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(get_cwq(gcwq->cpu, wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + static struct worker *alloc_worker(void) { struct worker *worker; @@ -1323,7 +1761,7 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, worker_rebind_fn); + INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1333,7 +1771,6 @@ static struct worker *alloc_worker(void) /** * create_worker - create a new workqueue worker * @pool: pool the new worker will belong to - * @bind: whether to set affinity to @cpu or not * * Create a new worker which is bound to @pool. The returned worker * can be started by calling start_worker() or destroyed using @@ -1345,10 +1782,9 @@ static struct worker *alloc_worker(void) * RETURNS: * Pointer to the newly created worker. */ -static struct worker *create_worker(struct worker_pool *pool, bool bind) +static struct worker *create_worker(struct worker_pool *pool) { struct global_cwq *gcwq = pool->gcwq; - bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; const char *pri = worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1369,7 +1805,7 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) worker->pool = pool; worker->id = id; - if (!on_unbound_cpu) + if (gcwq->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(gcwq->cpu), "kworker/%u:%d%s", gcwq->cpu, id, pri); @@ -1383,16 +1819,19 @@ static struct worker *create_worker(struct worker_pool *pool, bool bind) set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); /* - * A rogue worker will become a regular one if CPU comes - * online later on. Make sure every worker has - * PF_THREAD_BOUND set. + * Determine CPU binding of the new worker depending on + * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the + * flag remains stable across this function. See the comments + * above the flag definition for details. + * + * As an unbound worker may later become a regular one if CPU comes + * online, make sure every worker has %PF_THREAD_BOUND set. */ - if (bind && !on_unbound_cpu) + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { kthread_bind(worker->task, gcwq->cpu); - else { + } else { worker->task->flags |= PF_THREAD_BOUND; - if (on_unbound_cpu) - worker->flags |= WORKER_UNBOUND; + worker->flags |= WORKER_UNBOUND; } return worker; @@ -1568,7 +2007,7 @@ __acquires(&gcwq->lock) while (true) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); if (worker) { del_timer_sync(&pool->mayday_timer); spin_lock_irq(&gcwq->lock); @@ -1655,15 +2094,49 @@ static bool maybe_destroy_workers(struct worker_pool *pool) static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - struct global_cwq *gcwq = pool->gcwq; bool ret = false; if (pool->flags & POOL_MANAGING_WORKERS) return ret; - pool->flags &= ~POOL_MANAGE_WORKERS; pool->flags |= POOL_MANAGING_WORKERS; + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->assoc_mutex to synchronize + * manager against CPU hotplug. + * + * assoc_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->assoc_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for assoc_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via assoc_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + + pool->flags &= ~POOL_MANAGE_WORKERS; + /* * Destroy and then create so that may_start_working() is true * on return. @@ -1672,118 +2145,10 @@ static bool manage_workers(struct worker *worker) ret |= maybe_create_worker(pool); pool->flags &= ~POOL_MANAGING_WORKERS; - - /* - * The trustee might be waiting to take over the manager - * position, tell it we're done. - */ - if (unlikely(gcwq->trustee)) - wake_up_all(&gcwq->trustee_wait); - + mutex_unlock(&pool->assoc_mutex); return ret; } -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ - struct work_struct *work = list_first_entry(&cwq->delayed_works, - struct work_struct, entry); - - trace_workqueue_activate_work(work); - move_linked_works(work, &cwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); - cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, - bool delayed) -{ - /* ignore uncolored works */ - if (color == WORK_NO_COLOR) - return; - - cwq->nr_in_flight[color]--; - - if (!delayed) { - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); - } - } - - /* is flush in progress and are we at the flushing tip? */ - if (likely(cwq->flush_color != color)) - return; - - /* are there still in-flight works? */ - if (cwq->nr_in_flight[color]) - return; - - /* this cwq is done, clear flush_color */ - cwq->flush_color = -1; - - /* - * If this was the last cwq, wake up the first flusher. It - * will handle the rest. - */ - if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) - complete(&cwq->wq->first_flusher->done); -} - /** * process_one_work - process single work * @worker: self @@ -1818,8 +2183,19 @@ __acquires(&gcwq->lock) * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = work->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif + /* + * Ensure we're on the correct CPU. DISASSOCIATED test is + * necessary to avoid spurious warnings from rescuers servicing the + * unbound or a disassociated gcwq. + */ + WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && + !(gcwq->flags & GCWQ_DISASSOCIATED) && + raw_smp_processor_id() != gcwq->cpu); + /* * A single work shouldn't be executed concurrently by * multiple workers on a single cpu. Check whether anyone is @@ -1832,15 +2208,13 @@ __acquires(&gcwq->lock) return; } - /* claim and process */ + /* claim and dequeue */ debug_work_deactivate(work); hlist_add_head(&worker->hentry, bwh); worker->current_work = work; worker->current_cwq = cwq; work_color = get_work_color(work); - /* record the current cpu number in the work data and dequeue */ - set_work_cpu(work, gcwq->cpu); list_del_init(&work->entry); /* @@ -1857,9 +2231,16 @@ __acquires(&gcwq->lock) if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) wake_up_worker(pool); + /* + * Record the last CPU and clear PENDING which should be the last + * update to @work. Also, do this inside @gcwq->lock so that + * PENDING and queued state changes happen together while IRQ is + * disabled. + */ + set_work_cpu_and_clear_pending(work, gcwq->cpu); + spin_unlock_irq(&gcwq->lock); - work_clear_pending(work); lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); @@ -1873,11 +2254,9 @@ __acquires(&gcwq->lock) lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); + pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" + " last function: %pf\n", + current->comm, preempt_count(), task_pid_nr(current), f); debug_show_held_locks(current); BUG_ON(PANIC_CORRUPTION); dump_stack(); @@ -1938,11 +2317,19 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&gcwq->lock); - /* DIE can be set only while we're idle, checking here is enough */ - if (worker->flags & WORKER_DIE) { + /* we are off idle list if destruction or rebind is requested */ + if (unlikely(list_empty(&worker->entry))) { spin_unlock_irq(&gcwq->lock); - worker->task->flags &= ~PF_WQ_WORKER; - return 0; + + /* if DIE is set, destruction is requested */ + if (worker->flags & WORKER_DIE) { + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + /* otherwise, rebind */ + idle_worker_rebind(worker); + goto woke_up; } worker_leave_idle(worker); @@ -2035,8 +2422,10 @@ static int rescuer_thread(void *__wq) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); return 0; + } /* * See whether any cpu is asking for help. Unbounded @@ -2423,8 +2812,8 @@ void drain_workqueue(struct workqueue_struct *wq) if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", - wq->name, flush_cnt); + pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + wq->name, flush_cnt); goto reflush; } @@ -2435,8 +2824,7 @@ void drain_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, - bool wait_executing) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) { struct worker *worker = NULL; struct global_cwq *gcwq; @@ -2458,13 +2846,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, cwq = get_work_cwq(work); if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) goto already_gone; - } else if (wait_executing) { + } else { worker = find_worker_executing_work(gcwq, work); if (!worker) goto already_gone; cwq = worker->current_cwq; - } else - goto already_gone; + } insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); @@ -2490,159 +2877,51 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, /** * flush_work - wait for a work to finish executing the last queueing instance * @work: the work to flush - * - * Wait until @work has finished execution. This function considers - * only the last queueing instance of @work. If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. - * - * RETURNS: - * %true if flush_work() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work(struct work_struct *work) -{ - struct wq_barrier barr; - - if (start_flush_work(work, &barr, true)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else - return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ - struct wq_barrier barr; - struct worker *worker; - - spin_lock_irq(&gcwq->lock); - - worker = find_worker_executing_work(gcwq, work); - if (unlikely(worker)) - insert_wq_barrier(worker->current_cwq, &barr, work, worker); - - spin_unlock_irq(&gcwq->lock); - - if (unlikely(worker)) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - return true; - } else - return false; -} - -static bool wait_on_work(struct work_struct *work) -{ - bool ret = false; - int cpu; - - might_sleep(); - - lock_map_acquire(&work->lockdep_map); - lock_map_release(&work->lockdep_map); - - for_each_gcwq_cpu(cpu) - ret |= wait_on_cpu_work(get_gcwq(cpu), work); - return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution. On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished. In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work_sync(struct work_struct *work) -{ - struct wq_barrier barr; - bool pending, waited; - - /* we'll wait for executions separately, queue barr only if pending */ - pending = start_flush_work(work, &barr, false); - - /* wait for executions to finish */ - waited = wait_on_work(work); - - /* wait for the pending one */ - if (pending) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); - } - - return pending || waited; -} -EXPORT_SYMBOL_GPL(flush_work_sync); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ - struct global_cwq *gcwq; - int ret = -1; - - if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) - return 0; - - /* - * The queueing is in progress, or it is already queued. Try to - * steal it from ->worklist without clearing WORK_STRUCT_PENDING. - */ - gcwq = get_work_gcwq(work); - if (!gcwq) - return ret; - - spin_lock_irq(&gcwq->lock); - if (!list_empty(&work->entry)) { - /* - * This work is queued, but perhaps we locked the wrong gcwq. - * In that case we must see the new value after rmb(), see - * insert_work()->wmb(). - */ - smp_rmb(); - if (gcwq == get_work_gcwq(work)) { - debug_work_deactivate(work); - list_del_init(&work->entry); - cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work), - *work_data_bits(work) & WORK_STRUCT_DELAYED); - ret = 1; - } - } - spin_unlock_irq(&gcwq->lock); + * + * Wait until @work has finished execution. @work is guaranteed to be idle + * on return if it hasn't been requeued since flush started. + * + * RETURNS: + * %true if flush_work() waited for the work to finish execution, + * %false if it was already idle. + */ +bool flush_work(struct work_struct *work) +{ + struct wq_barrier barr; - return ret; + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + + if (start_flush_work(work, &barr)) { + wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + return true; + } else { + return false; + } } +EXPORT_SYMBOL_GPL(flush_work); -static bool __cancel_work_timer(struct work_struct *work, - struct timer_list* timer) +static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + unsigned long flags; int ret; do { - ret = (timer && likely(del_timer(timer))); - if (!ret) - ret = try_to_grab_pending(work); - wait_on_work(work); + ret = try_to_grab_pending(work, is_dwork, &flags); + /* + * If someone else is canceling, wait for the same event it + * would be waiting for before retrying. + */ + if (unlikely(ret == -ENOENT)) + flush_work(work); } while (unlikely(ret < 0)); + /* tell other tasks trying to grab @work to back off */ + mark_work_canceling(work); + local_irq_restore(flags); + + flush_work(work); clear_work_data(work); return ret; } @@ -2667,7 +2946,7 @@ static bool __cancel_work_timer(struct work_struct *work, */ bool cancel_work_sync(struct work_struct *work) { - return __cancel_work_timer(work, NULL); + return __cancel_work_timer(work, false); } EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -2685,33 +2964,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); */ bool flush_delayed_work(struct delayed_work *dwork) { + local_irq_disable(); if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), + __queue_work(dwork->cpu, get_work_cwq(&dwork->work)->wq, &dwork->work); + local_irq_enable(); return flush_work(&dwork->work); } EXPORT_SYMBOL(flush_delayed_work); /** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush + * cancel_delayed_work - cancel a delayed work + * @dwork: delayed_work to cancel * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately. Other than timer handling, its behavior - * is identical to flush_work_sync(). + * Kill off a pending delayed_work. Returns %true if @dwork was pending + * and canceled; %false if wasn't pending. Note that the work callback + * function may still be running on return, unless it returns %true and the + * work doesn't re-arm itself. Explicitly flush or use + * cancel_delayed_work_sync() to wait on it. * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. + * This function is safe to call from any context including IRQ handler. */ -bool flush_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work(struct delayed_work *dwork) { - if (del_timer_sync(&dwork->timer)) - __queue_work(raw_smp_processor_id(), - get_work_cwq(&dwork->work)->wq, &dwork->work); - return flush_work_sync(&dwork->work); + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(&dwork->work, true, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); + local_irq_restore(flags); + return ret; } -EXPORT_SYMBOL(flush_delayed_work_sync); +EXPORT_SYMBOL(cancel_delayed_work); /** * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish @@ -2724,54 +3014,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync); */ bool cancel_delayed_work_sync(struct delayed_work *dwork) { - return __cancel_work_timer(&dwork->work, &dwork->timer); + return __cancel_work_timer(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -int schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/* * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ -int schedule_work_on(int cpu, struct work_struct *work) +bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); /** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution + * schedule_work - put work task in global workqueue + * @work: job to be done * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. */ -int schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) +bool schedule_work(struct work_struct *work) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_work(system_wq, work); } -EXPORT_SYMBOL(schedule_delayed_work); +EXPORT_SYMBOL(schedule_work); /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay @@ -2782,13 +3057,27 @@ EXPORT_SYMBOL(schedule_delayed_work); * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ -int schedule_delayed_work_on(int cpu, - struct delayed_work *dwork, unsigned long delay) +bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call @@ -2936,9 +3225,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; if (max_active < 1 || max_active > lim) - printk(KERN_WARNING "workqueue: max_active %d requested for %s " - "is out of range, clamping between %d and %d\n", - max_active, name, 1, lim); + pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); return clamp_val(max_active, 1, lim); } @@ -3093,6 +3381,26 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * cwq_set_max_active - adjust max_active of a cwq + * @cwq: target cpu_workqueue_struct + * @max_active: new max_active value. + * + * Set @cwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +{ + cwq->max_active = max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); +} + /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue @@ -3120,7 +3428,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) - get_cwq(gcwq->cpu, wq)->max_active = max_active; + cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); spin_unlock_irq(&gcwq->lock); } @@ -3184,7 +3492,7 @@ unsigned int work_busy(struct work_struct *work) unsigned int ret = 0; if (!gcwq) - return false; + return 0; spin_lock_irqsave(&gcwq->lock, flags); @@ -3209,450 +3517,159 @@ EXPORT_SYMBOL_GPL(work_busy); * gcwqs serve mix of short, long and very long running works making * blocked draining impractical. * - * This is solved by allowing a gcwq to be detached from CPU, running - * it with unbound (rogue) workers and allowing it to be reattached - * later if the cpu comes back online. A separate thread is created - * to govern a gcwq in such state and is called the trustee of the - * gcwq. - * - * Trustee states and their descriptions. - * - * START Command state used on startup. On CPU_DOWN_PREPARE, a - * new trustee is started with this state. - * - * IN_CHARGE Once started, trustee will enter this state after - * assuming the manager role and making all existing - * workers rogue. DOWN_PREPARE waits for trustee to - * enter this state. After reaching IN_CHARGE, trustee - * tries to execute the pending worklist until it's empty - * and the state is set to BUTCHER, or the state is set - * to RELEASE. - * - * BUTCHER Command state which is set by the cpu callback after - * the cpu has went down. Once this state is set trustee - * knows that there will be no new works on the worklist - * and once the worklist is empty it can proceed to - * killing idle workers. - * - * RELEASE Command state which is set by the cpu callback if the - * cpu down has been canceled or it has come online - * again. After recognizing this state, trustee stops - * trying to drain or butcher and clears ROGUE, rebinds - * all remaining workers back to the cpu and releases - * manager role. - * - * DONE Trustee will enter this state after BUTCHER or RELEASE - * is complete. - * - * trustee CPU draining - * took over down complete - * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE - * | | ^ - * | CPU is back online v return workers | - * ----------------> RELEASE -------------- - */ - -/** - * trustee_wait_event_timeout - timed event wait for trustee - * @cond: condition to wait for - * @timeout: timeout in jiffies - * - * wait_event_timeout() for trustee to use. Handles locking and - * checks for RELEASE request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * Positive indicating left time if @cond is satisfied, 0 if timed - * out, -1 if canceled. - */ -#define trustee_wait_event_timeout(cond, timeout) ({ \ - long __ret = (timeout); \ - while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ - __ret) { \ - spin_unlock_irq(&gcwq->lock); \ - __wait_event_timeout(gcwq->trustee_wait, (cond) || \ - (gcwq->trustee_state == TRUSTEE_RELEASE), \ - __ret); \ - spin_lock_irq(&gcwq->lock); \ - } \ - gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ -}) - -/** - * trustee_wait_event - event wait for trustee - * @cond: condition to wait for - * - * wait_event() for trustee to use. Automatically handles locking and - * checks for CANCEL request. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by trustee. - * - * RETURNS: - * 0 if @cond is satisfied, -1 if canceled. + * This is solved by allowing a gcwq to be disassociated from the CPU + * running as an unbound one and allowing it to be reattached later if the + * cpu comes back online. */ -#define trustee_wait_event(cond) ({ \ - long __ret1; \ - __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ - __ret1 < 0 ? -1 : 0; \ -}) -static bool gcwq_is_managing_workers(struct global_cwq *gcwq) +/* claim manager positions of all pools */ +static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) { struct worker_pool *pool; for_each_worker_pool(pool, gcwq) - if (pool->flags & POOL_MANAGING_WORKERS) - return true; - return false; + mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); + spin_lock_irq(&gcwq->lock); } -static bool gcwq_has_idle_workers(struct global_cwq *gcwq) +/* release manager positions */ +static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) { struct worker_pool *pool; + spin_unlock_irq(&gcwq->lock); for_each_worker_pool(pool, gcwq) - if (!list_empty(&pool->idle_list)) - return true; - return false; + mutex_unlock(&pool->assoc_mutex); } -static int __cpuinit trustee_thread(void *__gcwq) +static void gcwq_unbind_fn(struct work_struct *work) { - struct global_cwq *gcwq = __gcwq; + struct global_cwq *gcwq = get_gcwq(smp_processor_id()); struct worker_pool *pool; struct worker *worker; - struct work_struct *work; struct hlist_node *pos; - long rc; int i; BUG_ON(gcwq->cpu != smp_processor_id()); - spin_lock_irq(&gcwq->lock); - /* - * Claim the manager position and make all workers rogue. - * Trustee must be bound to the target cpu and can't be - * cancelled. - */ - BUG_ON(gcwq->cpu != smp_processor_id()); - rc = trustee_wait_event(!gcwq_is_managing_workers(gcwq)); - BUG_ON(rc < 0); - - for_each_worker_pool(pool, gcwq) { - pool->flags |= POOL_MANAGING_WORKERS; - - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_ROGUE; - } - - for_each_busy_worker(worker, i, pos, gcwq) - worker->flags |= WORKER_ROGUE; - - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the rogue flag. This is - * necessary as scheduler callbacks may be invoked from other - * cpus. - */ - spin_unlock_irq(&gcwq->lock); - schedule(); - spin_lock_irq(&gcwq->lock); + gcwq_claim_assoc_and_lock(gcwq); /* - * Sched callbacks are disabled now. Zap nr_running. After - * this, nr_running stays zero and need_more_worker() and - * keep_working() are always true as long as the worklist is - * not empty. + * We've claimed all manager positions. Make all workers unbound + * and set DISASSOCIATED. Before this, all workers except for the + * ones which are still executing works from before the last CPU + * down must be on the cpu. After this, they may become diasporas. */ for_each_worker_pool(pool, gcwq) - atomic_set(get_pool_nr_running(pool), 0); - - spin_unlock_irq(&gcwq->lock); - for_each_worker_pool(pool, gcwq) - del_timer_sync(&pool->idle_timer); - spin_lock_irq(&gcwq->lock); - - /* - * We're now in charge. Notify and proceed to drain. We need - * to keep the gcwq running during the whole CPU down - * procedure as other cpu hotunplug callbacks may need to - * flush currently running tasks. - */ - gcwq->trustee_state = TRUSTEE_IN_CHARGE; - wake_up_all(&gcwq->trustee_wait); - - /* - * The original cpu is in the process of dying and may go away - * anytime now. When that happens, we and all workers would - * be migrated to other cpus. Try draining any left work. We - * want to get it over with ASAP - spam rescuers, wake up as - * many idlers as necessary and create new ones till the - * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezable cwqs. Don't declare - * completion while frozen. - */ - while (true) { - bool busy = false; - - for_each_worker_pool(pool, gcwq) - busy |= pool->nr_workers != pool->nr_idle; - - if (!busy && !(gcwq->flags & GCWQ_FREEZING) && - gcwq->trustee_state != TRUSTEE_IN_CHARGE) - break; - - for_each_worker_pool(pool, gcwq) { - int nr_works = 0; - - list_for_each_entry(work, &pool->worklist, entry) { - send_mayday(work); - nr_works++; - } + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags |= WORKER_UNBOUND; - list_for_each_entry(worker, &pool->idle_list, entry) { - if (!nr_works--) - break; - wake_up_process(worker->task); - } + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_UNBOUND; - if (need_to_create_worker(pool)) { - spin_unlock_irq(&gcwq->lock); - worker = create_worker(pool, false); - spin_lock_irq(&gcwq->lock); - if (worker) { - worker->flags |= WORKER_ROGUE; - start_worker(worker); - } - } - } + gcwq->flags |= GCWQ_DISASSOCIATED; - /* give a breather */ - if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) - break; - } + gcwq_release_assoc_and_unlock(gcwq); /* - * Either all works have been scheduled and cpu is down, or - * cpu down has already been canceled. Wait for and butcher - * all workers till we're canceled. + * Call schedule() so that we cross rq->lock and thus can guarantee + * sched callbacks see the %WORKER_UNBOUND flag. This is necessary + * as scheduler callbacks may be invoked from other cpus. */ - do { - rc = trustee_wait_event(gcwq_has_idle_workers(gcwq)); - - i = 0; - for_each_worker_pool(pool, gcwq) { - while (!list_empty(&pool->idle_list)) { - worker = list_first_entry(&pool->idle_list, - struct worker, entry); - destroy_worker(worker); - } - i |= pool->nr_workers; - } - } while (i && rc >= 0); + schedule(); /* - * At this point, either draining has completed and no worker - * is left, or cpu down has been canceled or the cpu is being - * brought back up. There shouldn't be any idle one left. - * Tell the remaining busy ones to rebind once it finishes the - * currently scheduled works by scheduling the rebind_work. + * Sched callbacks are disabled now. Zap nr_running. After this, + * nr_running stays zero and need_more_worker() and keep_working() + * are always true as long as the worklist is not empty. @gcwq now + * behaves as unbound (in terms of concurrency management) gcwq + * which is served by workers tied to the CPU. + * + * On return from this function, the current worker would trigger + * unbound chain execution of pending work items if other workers + * didn't already. */ for_each_worker_pool(pool, gcwq) - WARN_ON(!list_empty(&pool->idle_list)); - - for_each_busy_worker(worker, i, pos, gcwq) { - struct work_struct *rebind_work = &worker->rebind_work; - - /* - * Rebind_work may race with future cpu hotplug - * operations. Use a separate flag to mark that - * rebinding is scheduled. - */ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; - - /* queue rebind_work, wq doesn't matter, use the default one */ - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } - - /* relinquish manager role */ - for_each_worker_pool(pool, gcwq) - pool->flags &= ~POOL_MANAGING_WORKERS; - - /* notify completion */ - gcwq->trustee = NULL; - gcwq->trustee_state = TRUSTEE_DONE; - wake_up_all(&gcwq->trustee_wait); - spin_unlock_irq(&gcwq->lock); - return 0; + atomic_set(get_pool_nr_running(pool), 0); } -/** - * wait_trustee_state - wait for trustee to enter the specified state - * @gcwq: gcwq the trustee of interest belongs to - * @state: target state to wait for - * - * Wait for the trustee to reach @state. DONE is already matched. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock) which may be released and regrabbed - * multiple times. To be used by cpu_callback. +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. */ -static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) -__releases(&gcwq->lock) -__acquires(&gcwq->lock) -{ - if (!(gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE)) { - spin_unlock_irq(&gcwq->lock); - __wait_event(gcwq->trustee_wait, - gcwq->trustee_state == state || - gcwq->trustee_state == TRUSTEE_DONE); - spin_lock_irq(&gcwq->lock); - } -} - -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct global_cwq *gcwq = get_gcwq(cpu); - struct task_struct *new_trustee = NULL; - struct worker *new_workers[NR_WORKER_POOLS] = { }; struct worker_pool *pool; - unsigned long flags; - int i; - - action &= ~CPU_TASKS_FROZEN; - switch (action) { - case CPU_DOWN_PREPARE: - new_trustee = kthread_create(trustee_thread, gcwq, - "workqueue_trustee/%d\n", cpu); - if (IS_ERR(new_trustee)) - return notifier_from_errno(PTR_ERR(new_trustee)); - kthread_bind(new_trustee, cpu); - /* fall through */ + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - i = 0; for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - new_workers[i] = create_worker(pool, false); - if (!new_workers[i++]) - goto err_destroy; - } - } - - /* some are called w/ irq disabled, don't disturb irq status */ - spin_lock_irqsave(&gcwq->lock, flags); + struct worker *worker; - switch (action) { - case CPU_DOWN_PREPARE: - /* initialize trustee and tell it to acquire the gcwq */ - BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); - gcwq->trustee = new_trustee; - gcwq->trustee_state = TRUSTEE_START; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); - /* fall through */ - case CPU_UP_PREPARE: - i = 0; - for_each_worker_pool(pool, gcwq) { - BUG_ON(pool->first_idle); - pool->first_idle = new_workers[i++]; - } - break; + if (pool->nr_workers) + continue; - case CPU_DYING: - /* - * Before this, the trustee and all workers except for - * the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they'll all be diasporas. - */ - gcwq->flags |= GCWQ_DISASSOCIATED; - break; + worker = create_worker(pool); + if (!worker) + return NOTIFY_BAD; - case CPU_POST_DEAD: - gcwq->trustee_state = TRUSTEE_BUTCHER; - /* fall through */ - case CPU_UP_CANCELED: - for_each_worker_pool(pool, gcwq) { - destroy_worker(pool->first_idle); - pool->first_idle = NULL; + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); } break; case CPU_DOWN_FAILED: case CPU_ONLINE: + gcwq_claim_assoc_and_lock(gcwq); gcwq->flags &= ~GCWQ_DISASSOCIATED; - if (gcwq->trustee_state != TRUSTEE_DONE) { - gcwq->trustee_state = TRUSTEE_RELEASE; - wake_up_process(gcwq->trustee); - wait_trustee_state(gcwq, TRUSTEE_DONE); - } - - /* - * Trustee is done and there might be no worker left. - * Put the first_idle in and request a real manager to - * take a look. - */ - for_each_worker_pool(pool, gcwq) { - spin_unlock_irq(&gcwq->lock); - kthread_bind(pool->first_idle->task, cpu); - spin_lock_irq(&gcwq->lock); - pool->flags |= POOL_MANAGE_WORKERS; - start_worker(pool->first_idle); - pool->first_idle = NULL; - } + rebind_workers(gcwq); + gcwq_release_assoc_and_unlock(gcwq); break; } + return NOTIFY_OK; +} - spin_unlock_irqrestore(&gcwq->lock, flags); - - return notifier_from_errno(0); - -err_destroy: - if (new_trustee) - kthread_stop(new_trustee); - - spin_lock_irqsave(&gcwq->lock, flags); - for (i = 0; i < NR_WORKER_POOLS; i++) - if (new_workers[i]) - destroy_worker(new_workers[i]); - spin_unlock_irqrestore(&gcwq->lock, flags); +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; - return NOTIFY_BAD; + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + flush_work(&unbind_work); + break; + } + return NOTIFY_OK; } #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3667,19 +3684,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); @@ -3809,11 +3818,7 @@ void thaw_workqueues(void) continue; /* restore max_active and repopulate worklist */ - cwq->max_active = wq->saved_max_active; - - while (!list_empty(&cwq->delayed_works) && - cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + cwq_set_max_active(cwq, wq->saved_max_active); } for_each_worker_pool(pool, gcwq) @@ -3833,7 +3838,12 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + /* make sure we have enough bits for OFFQ CPU number */ + BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < + WORK_CPU_LAST); + + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { @@ -3859,11 +3869,9 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, (unsigned long)pool); + mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); } - - gcwq->trustee_state = TRUSTEE_DONE; - init_waitqueue_head(&gcwq->trustee_wait); } /* create the initial worker */ @@ -3877,7 +3885,7 @@ static int __init init_workqueues(void) for_each_worker_pool(pool, gcwq) { struct worker *worker; - worker = create_worker(pool, true); + worker = create_worker(pool); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); start_worker(worker); @@ -3886,17 +3894,14 @@ static int __init init_workqueues(void) } system_wq = alloc_workqueue("events", 0, 0); + system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); - system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); - system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", - WQ_NON_REENTRANT | WQ_FREEZABLE, 0); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq || - !system_nrt_freezable_wq); + BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues);