workqueue: reimplement CPU hotplugging support using trustee
kernel/workqueue.c [linux-2.6-block.git]
/*
 * linux/kernel/workqueue.c
 *
 * Generic mechanism for defining kernel helper threads for running
 * arbitrary tasks in process context.
 *
 * Started by Ingo Molnar, Copyright (C) 2002
 *
 * Derived from the taskqueue/keventd code by:
 *
 *   David Woodhouse <dwmw2@infradead.org>
 *   Andrew Morton
 *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *   Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>

enum {
	/* global_cwq flags */
	GCWQ_FREEZING		= 1 << 3,	/* freeze in progress */

	/* worker flags */
	WORKER_STARTED		= 1 << 0,	/* started */
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_ROGUE		= 1 << 4,	/* not bound to any cpu */

	/* gcwq->trustee_state */
	TRUSTEE_START		= 0,		/* start */
	TRUSTEE_IN_CHARGE	= 1,		/* trustee in charge of gcwq */
	TRUSTEE_BUTCHER		= 2,		/* butcher workers */
	TRUSTEE_RELEASE		= 3,		/* release workers */
	TRUSTEE_DONE		= 4,		/* trustee is done */

	BUSY_WORKER_HASH_ORDER	= 6,		/* 64 pointers */
	BUSY_WORKER_HASH_SIZE	= 1 << BUSY_WORKER_HASH_ORDER,
	BUSY_WORKER_HASH_MASK	= BUSY_WORKER_HASH_SIZE - 1,

	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
};

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Set during initialization and read-only afterwards.
 *
 * L: gcwq->lock protected.  Access with gcwq->lock held.
 *
 * F: wq->flush_mutex protected.
 *
 * W: workqueue_lock protected.
 */

struct global_cwq;
struct cpu_workqueue_struct;

struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	struct list_head	scheduled;	/* L: scheduled works */
	struct task_struct	*task;		/* I: worker task */
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	struct cpu_workqueue_struct *cwq;	/* I: the associated cwq */
	unsigned int		flags;		/* L: flags */
	int			id;		/* I: worker id */
};

/*
 * Global per-cpu workqueue.
 */
struct global_cwq {
	spinlock_t		lock;		/* the gcwq lock */
	unsigned int		cpu;		/* I: the associated cpu */
	unsigned int		flags;		/* L: GCWQ_* flags */

	int			nr_workers;	/* L: total number of workers */
	int			nr_idle;	/* L: currently idle ones */

	/* workers are chained either in the idle_list or busy_hash */
	struct list_head	idle_list;	/* L: list of idle workers */
	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];
						/* L: hash of busy workers */

	struct ida		worker_ida;	/* L: for worker IDs */

	struct task_struct	*trustee;	/* L: for gcwq shutdown */
	unsigned int		trustee_state;	/* L: trustee state */
	wait_queue_head_t	trustee_wait;	/* trustee wait */
} ____cacheline_aligned_in_smp;

/*
 * The per-CPU workqueue (if single thread, we always use the first
 * possible cpu).  The lower WORK_STRUCT_FLAG_BITS of
 * work_struct->data are used for flags and thus cwqs need to be
 * aligned at two's power of the number of flag bits.
 */
struct cpu_workqueue_struct {
	struct global_cwq	*gcwq;		/* I: the associated gcwq */
	struct list_head	worklist;
	struct worker		*worker;
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works */
	int			max_active;	/* L: max active works */
	struct list_head	delayed_works;	/* L: delayed works */
};

/*
 * Structure used to wait for workqueue flush.
 */
struct wq_flusher {
	struct list_head	list;		/* F: list of flushers */
	int			flush_color;	/* F: flush color waiting for */
	struct completion	done;		/* flush completion */
};

/*
 * The externally visible workqueue abstraction is an array of
 * per-CPU workqueues:
 */
struct workqueue_struct {
	unsigned int		flags;		/* I: WQ_* flags */
	struct cpu_workqueue_struct *cpu_wq;	/* I: cwq's */
	struct list_head	list;		/* W: list of all workqueues */

	struct mutex		flush_mutex;	/* protects wq flushing */
	int			work_color;	/* F: current work color */
	int			flush_color;	/* F: current flush color */
	atomic_t		nr_cwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* F: first flusher */
	struct list_head	flusher_queue;	/* F: flush waiters */
	struct list_head	flusher_overflow; /* F: flush overflow list */

	int			saved_max_active; /* I: saved cwq max_active */
	const char		*name;		/* I: workqueue name */
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
};

#define for_each_busy_worker(worker, i, pos, gcwq)			\
	for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)			\
		hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)

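/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * fields marked "L:" above and the busy hash are only stable under
 * gcwq->lock, so a walk over the busy workers looks like this ("gcwq", "i",
 * "pos" and "worker" are hypothetical local variables of the caller):
 *
 *	spin_lock_irq(&gcwq->lock);
 *	for_each_busy_worker(worker, i, pos, gcwq) {
 *		... inspect worker->current_work ...
 *	}
 *	spin_unlock_irq(&gcwq->lock);
 */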
dc186ad7
TG
173#ifdef CONFIG_DEBUG_OBJECTS_WORK
174
175static struct debug_obj_descr work_debug_descr;
176
177/*
178 * fixup_init is called when:
179 * - an active object is initialized
180 */
181static int work_fixup_init(void *addr, enum debug_obj_state state)
182{
183 struct work_struct *work = addr;
184
185 switch (state) {
186 case ODEBUG_STATE_ACTIVE:
187 cancel_work_sync(work);
188 debug_object_init(work, &work_debug_descr);
189 return 1;
190 default:
191 return 0;
192 }
193}
194
195/*
196 * fixup_activate is called when:
197 * - an active object is activated
198 * - an unknown object is activated (might be a statically initialized object)
199 */
200static int work_fixup_activate(void *addr, enum debug_obj_state state)
201{
202 struct work_struct *work = addr;
203
204 switch (state) {
205
206 case ODEBUG_STATE_NOTAVAILABLE:
207 /*
208 * This is not really a fixup. The work struct was
209 * statically initialized. We just make sure that it
210 * is tracked in the object tracker.
211 */
22df02bb 212 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
dc186ad7
TG
213 debug_object_init(work, &work_debug_descr);
214 debug_object_activate(work, &work_debug_descr);
215 return 0;
216 }
217 WARN_ON_ONCE(1);
218 return 0;
219
220 case ODEBUG_STATE_ACTIVE:
221 WARN_ON(1);
222
223 default:
224 return 0;
225 }
226}
227
228/*
229 * fixup_free is called when:
230 * - an active object is freed
231 */
232static int work_fixup_free(void *addr, enum debug_obj_state state)
233{
234 struct work_struct *work = addr;
235
236 switch (state) {
237 case ODEBUG_STATE_ACTIVE:
238 cancel_work_sync(work);
239 debug_object_free(work, &work_debug_descr);
240 return 1;
241 default:
242 return 0;
243 }
244}
245
246static struct debug_obj_descr work_debug_descr = {
247 .name = "work_struct",
248 .fixup_init = work_fixup_init,
249 .fixup_activate = work_fixup_activate,
250 .fixup_free = work_fixup_free,
251};
252
253static inline void debug_work_activate(struct work_struct *work)
254{
255 debug_object_activate(work, &work_debug_descr);
256}
257
258static inline void debug_work_deactivate(struct work_struct *work)
259{
260 debug_object_deactivate(work, &work_debug_descr);
261}
262
263void __init_work(struct work_struct *work, int onstack)
264{
265 if (onstack)
266 debug_object_init_on_stack(work, &work_debug_descr);
267 else
268 debug_object_init(work, &work_debug_descr);
269}
270EXPORT_SYMBOL_GPL(__init_work);
271
272void destroy_work_on_stack(struct work_struct *work)
273{
274 debug_object_free(work, &work_debug_descr);
275}
276EXPORT_SYMBOL_GPL(destroy_work_on_stack);
277
278#else
279static inline void debug_work_activate(struct work_struct *work) { }
280static inline void debug_work_deactivate(struct work_struct *work) { }
281#endif
282
/* Serializes the accesses to the list of workqueues. */
static DEFINE_SPINLOCK(workqueue_lock);
static LIST_HEAD(workqueues);
static bool workqueue_freezing;		/* W: have wqs started freezing? */

static DEFINE_PER_CPU(struct global_cwq, global_cwq);

static int worker_thread(void *__worker);

static int singlethread_cpu __read_mostly;

static struct global_cwq *get_gcwq(unsigned int cpu)
{
	return &per_cpu(global_cwq, cpu);
}

static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
					    struct workqueue_struct *wq)
{
	return per_cpu_ptr(wq->cpu_wq, cpu);
}

static struct cpu_workqueue_struct *target_cwq(unsigned int cpu,
					       struct workqueue_struct *wq)
{
	if (unlikely(wq->flags & WQ_SINGLE_THREAD))
		cpu = singlethread_cpu;
	return get_cwq(cpu, wq);
}

static unsigned int work_color_to_flags(int color)
{
	return color << WORK_STRUCT_COLOR_SHIFT;
}

static int get_work_color(struct work_struct *work)
{
	return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
		((1 << WORK_STRUCT_COLOR_BITS) - 1);
}

static int work_next_color(int color)
{
	return (color + 1) % WORK_NR_COLORS;
}

/*
 * Set the workqueue on which a work item is to be run
 * - Must *only* be called if the pending flag is set
 */
static inline void set_wq_data(struct work_struct *work,
			       struct cpu_workqueue_struct *cwq,
			       unsigned long extra_flags)
{
	BUG_ON(!work_pending(work));

	atomic_long_set(&work->data, (unsigned long)cwq | work_static(work) |
			WORK_STRUCT_PENDING | extra_flags);
}

/*
 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
 */
static inline void clear_wq_data(struct work_struct *work)
{
	atomic_long_set(&work->data, work_static(work));
}

static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
{
	return (void *)(atomic_long_read(&work->data) &
			WORK_STRUCT_WQ_DATA_MASK);
}

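/*
 * Illustrative note (editorial addition, not part of the original file):
 * the helpers above rely on cwqs being aligned to 1 << WORK_STRUCT_FLAG_BITS
 * (see alloc_cwqs() below), so the low flag bits of work->data never collide
 * with the pointer.  Conceptually:
 *
 *	data  = (unsigned long)cwq | WORK_STRUCT_PENDING | extra_flags;
 *	cwq   = (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
 *	flags = data & WORK_STRUCT_FLAG_MASK;
 */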
c8e55f36
TH
357/**
358 * busy_worker_head - return the busy hash head for a work
359 * @gcwq: gcwq of interest
360 * @work: work to be hashed
361 *
362 * Return hash head of @gcwq for @work.
363 *
364 * CONTEXT:
365 * spin_lock_irq(gcwq->lock).
366 *
367 * RETURNS:
368 * Pointer to the hash head.
369 */
370static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
371 struct work_struct *work)
372{
373 const int base_shift = ilog2(sizeof(struct work_struct));
374 unsigned long v = (unsigned long)work;
375
376 /* simple shift and fold hash, do we need something better? */
377 v >>= base_shift;
378 v += v >> BUSY_WORKER_HASH_ORDER;
379 v &= BUSY_WORKER_HASH_MASK;
380
381 return &gcwq->busy_hash[v];
382}
383
/**
 * insert_work - insert a work into cwq
 * @cwq: cwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work into @cwq after @head.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
static void insert_work(struct cpu_workqueue_struct *cwq,
			struct work_struct *work, struct list_head *head,
			unsigned int extra_flags)
{
	/* we own @work, set data and link */
	set_wq_data(work, cwq, extra_flags);

	/*
	 * Ensure that we get the right work->data if we see the
	 * result of list_add() below, see try_to_grab_pending().
	 */
	smp_wmb();

	list_add_tail(&work->entry, head);
	wake_up_process(cwq->worker->task);
}

static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct cpu_workqueue_struct *cwq = target_cwq(cpu, wq);
	struct global_cwq *gcwq = cwq->gcwq;
	struct list_head *worklist;
	unsigned long flags;

	debug_work_activate(work);

	spin_lock_irqsave(&gcwq->lock, flags);
	BUG_ON(!list_empty(&work->entry));

	cwq->nr_in_flight[cwq->work_color]++;

	if (likely(cwq->nr_active < cwq->max_active)) {
		cwq->nr_active++;
		worklist = &cwq->worklist;
	} else
		worklist = &cwq->delayed_works;

	insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color));

	spin_unlock_irqrestore(&gcwq->lock, flags);
}

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns 0 if @work was already on a queue, non-zero otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 */
int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
	int ret;

	ret = queue_work_on(get_cpu(), wq, work);
	put_cpu();

	return ret;
}
EXPORT_SYMBOL_GPL(queue_work);

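/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file).  A typical caller embeds a work_struct in its own object,
 * initializes it once with INIT_WORK() and then queues it; "struct my_dev",
 * "my_wq" and my_dev_reset_fn() are hypothetical names:
 *
 *	struct my_dev {
 *		struct work_struct reset_work;
 *	};
 *
 *	static void my_dev_reset_fn(struct work_struct *work)
 *	{
 *		struct my_dev *dev = container_of(work, struct my_dev,
 *						  reset_work);
 *		... runs in process context, may sleep ...
 *	}
 *
 *	INIT_WORK(&dev->reset_work, my_dev_reset_fn);
 *	queue_work(my_wq, &dev->reset_work);
 */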
c1a220e7
ZR
460/**
461 * queue_work_on - queue work on specific cpu
462 * @cpu: CPU number to execute work on
463 * @wq: workqueue to use
464 * @work: work to queue
465 *
466 * Returns 0 if @work was already on a queue, non-zero otherwise.
467 *
468 * We queue the work to a specific CPU, the caller must ensure it
469 * can't go away.
470 */
471int
472queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
473{
474 int ret = 0;
475
22df02bb 476 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
4690c4ab 477 __queue_work(cpu, wq, work);
c1a220e7
ZR
478 ret = 1;
479 }
480 return ret;
481}
482EXPORT_SYMBOL_GPL(queue_work_on);
483
6d141c3f 484static void delayed_work_timer_fn(unsigned long __data)
1da177e4 485{
52bad64d 486 struct delayed_work *dwork = (struct delayed_work *)__data;
ed7c0fee 487 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
1da177e4 488
4690c4ab 489 __queue_work(smp_processor_id(), cwq->wq, &dwork->work);
1da177e4
LT
490}
491
0fcb78c2
REB
492/**
493 * queue_delayed_work - queue work on a workqueue after delay
494 * @wq: workqueue to use
af9997e4 495 * @dwork: delayable work to queue
0fcb78c2
REB
496 * @delay: number of jiffies to wait before queueing
497 *
057647fc 498 * Returns 0 if @work was already on a queue, non-zero otherwise.
0fcb78c2 499 */
7ad5b3a5 500int queue_delayed_work(struct workqueue_struct *wq,
52bad64d 501 struct delayed_work *dwork, unsigned long delay)
1da177e4 502{
52bad64d 503 if (delay == 0)
63bc0362 504 return queue_work(wq, &dwork->work);
1da177e4 505
63bc0362 506 return queue_delayed_work_on(-1, wq, dwork, delay);
1da177e4 507}
ae90dd5d 508EXPORT_SYMBOL_GPL(queue_delayed_work);
1da177e4 509
0fcb78c2
REB
510/**
511 * queue_delayed_work_on - queue work on specific CPU after delay
512 * @cpu: CPU number to execute work on
513 * @wq: workqueue to use
af9997e4 514 * @dwork: work to queue
0fcb78c2
REB
515 * @delay: number of jiffies to wait before queueing
516 *
057647fc 517 * Returns 0 if @work was already on a queue, non-zero otherwise.
0fcb78c2 518 */
7a6bc1cd 519int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
52bad64d 520 struct delayed_work *dwork, unsigned long delay)
7a6bc1cd
VP
521{
522 int ret = 0;
52bad64d
DH
523 struct timer_list *timer = &dwork->timer;
524 struct work_struct *work = &dwork->work;
7a6bc1cd 525
22df02bb 526 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
7a6bc1cd
VP
527 BUG_ON(timer_pending(timer));
528 BUG_ON(!list_empty(&work->entry));
529
8a3e77cc
AL
530 timer_stats_timer_set_start_info(&dwork->timer);
531
ed7c0fee 532 /* This stores cwq for the moment, for the timer_fn */
1537663f 533 set_wq_data(work, target_cwq(raw_smp_processor_id(), wq), 0);
7a6bc1cd 534 timer->expires = jiffies + delay;
52bad64d 535 timer->data = (unsigned long)dwork;
7a6bc1cd 536 timer->function = delayed_work_timer_fn;
63bc0362
ON
537
538 if (unlikely(cpu >= 0))
539 add_timer_on(timer, cpu);
540 else
541 add_timer(timer);
7a6bc1cd
VP
542 ret = 1;
543 }
544 return ret;
545}
ae90dd5d 546EXPORT_SYMBOL_GPL(queue_delayed_work_on);
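/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file).  A delayed work is initialized with INIT_DELAYED_WORK() and queued
 * with a jiffies delay; "my_wq", "dev->poll_work" and my_dev_poll_fn() are
 * hypothetical names:
 *
 *	INIT_DELAYED_WORK(&dev->poll_work, my_dev_poll_fn);
 *	queue_delayed_work(my_wq, &dev->poll_work, msecs_to_jiffies(100));
 *
 * or, to pin execution to a specific CPU which the caller guarantees stays
 * available:
 *
 *	queue_delayed_work_on(cpu, my_wq, &dev->poll_work, HZ);
 */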
1da177e4 547
c8e55f36
TH
548/**
549 * worker_enter_idle - enter idle state
550 * @worker: worker which is entering idle state
551 *
552 * @worker is entering idle state. Update stats and idle timer if
553 * necessary.
554 *
555 * LOCKING:
556 * spin_lock_irq(gcwq->lock).
557 */
558static void worker_enter_idle(struct worker *worker)
559{
560 struct global_cwq *gcwq = worker->gcwq;
561
562 BUG_ON(worker->flags & WORKER_IDLE);
563 BUG_ON(!list_empty(&worker->entry) &&
564 (worker->hentry.next || worker->hentry.pprev));
565
566 worker->flags |= WORKER_IDLE;
567 gcwq->nr_idle++;
568
569 /* idle_list is LIFO */
570 list_add(&worker->entry, &gcwq->idle_list);
db7bccf4
TH
571
572 if (unlikely(worker->flags & WORKER_ROGUE))
573 wake_up_all(&gcwq->trustee_wait);
c8e55f36
TH
574}
575
576/**
577 * worker_leave_idle - leave idle state
578 * @worker: worker which is leaving idle state
579 *
580 * @worker is leaving idle state. Update stats.
581 *
582 * LOCKING:
583 * spin_lock_irq(gcwq->lock).
584 */
585static void worker_leave_idle(struct worker *worker)
586{
587 struct global_cwq *gcwq = worker->gcwq;
588
589 BUG_ON(!(worker->flags & WORKER_IDLE));
590 worker->flags &= ~WORKER_IDLE;
591 gcwq->nr_idle--;
592 list_del_init(&worker->entry);
593}
594
c34056a3
TH
595static struct worker *alloc_worker(void)
596{
597 struct worker *worker;
598
599 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
c8e55f36
TH
600 if (worker) {
601 INIT_LIST_HEAD(&worker->entry);
affee4b2 602 INIT_LIST_HEAD(&worker->scheduled);
c8e55f36 603 }
c34056a3
TH
604 return worker;
605}
606
607/**
608 * create_worker - create a new workqueue worker
609 * @cwq: cwq the new worker will belong to
610 * @bind: whether to set affinity to @cpu or not
611 *
612 * Create a new worker which is bound to @cwq. The returned worker
613 * can be started by calling start_worker() or destroyed using
614 * destroy_worker().
615 *
616 * CONTEXT:
617 * Might sleep. Does GFP_KERNEL allocations.
618 *
619 * RETURNS:
620 * Pointer to the newly created worker.
621 */
622static struct worker *create_worker(struct cpu_workqueue_struct *cwq, bool bind)
623{
8b03ae3c 624 struct global_cwq *gcwq = cwq->gcwq;
c34056a3
TH
625 int id = -1;
626 struct worker *worker = NULL;
627
8b03ae3c
TH
628 spin_lock_irq(&gcwq->lock);
629 while (ida_get_new(&gcwq->worker_ida, &id)) {
630 spin_unlock_irq(&gcwq->lock);
631 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
c34056a3 632 goto fail;
8b03ae3c 633 spin_lock_irq(&gcwq->lock);
c34056a3 634 }
8b03ae3c 635 spin_unlock_irq(&gcwq->lock);
c34056a3
TH
636
637 worker = alloc_worker();
638 if (!worker)
639 goto fail;
640
8b03ae3c 641 worker->gcwq = gcwq;
c34056a3
TH
642 worker->cwq = cwq;
643 worker->id = id;
644
645 worker->task = kthread_create(worker_thread, worker, "kworker/%u:%d",
8b03ae3c 646 gcwq->cpu, id);
c34056a3
TH
647 if (IS_ERR(worker->task))
648 goto fail;
649
db7bccf4
TH
650 /*
651 * A rogue worker will become a regular one if CPU comes
652 * online later on. Make sure every worker has
653 * PF_THREAD_BOUND set.
654 */
c34056a3 655 if (bind)
8b03ae3c 656 kthread_bind(worker->task, gcwq->cpu);
db7bccf4
TH
657 else
658 worker->task->flags |= PF_THREAD_BOUND;
c34056a3
TH
659
660 return worker;
661fail:
662 if (id >= 0) {
8b03ae3c
TH
663 spin_lock_irq(&gcwq->lock);
664 ida_remove(&gcwq->worker_ida, id);
665 spin_unlock_irq(&gcwq->lock);
c34056a3
TH
666 }
667 kfree(worker);
668 return NULL;
669}
670
671/**
672 * start_worker - start a newly created worker
673 * @worker: worker to start
674 *
c8e55f36 675 * Make the gcwq aware of @worker and start it.
c34056a3
TH
676 *
677 * CONTEXT:
8b03ae3c 678 * spin_lock_irq(gcwq->lock).
c34056a3
TH
679 */
680static void start_worker(struct worker *worker)
681{
c8e55f36
TH
682 worker->flags |= WORKER_STARTED;
683 worker->gcwq->nr_workers++;
684 worker_enter_idle(worker);
c34056a3
TH
685 wake_up_process(worker->task);
686}
687
688/**
689 * destroy_worker - destroy a workqueue worker
690 * @worker: worker to be destroyed
691 *
c8e55f36
TH
692 * Destroy @worker and adjust @gcwq stats accordingly.
693 *
694 * CONTEXT:
695 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
c34056a3
TH
696 */
697static void destroy_worker(struct worker *worker)
698{
8b03ae3c 699 struct global_cwq *gcwq = worker->gcwq;
c34056a3
TH
700 int id = worker->id;
701
702 /* sanity check frenzy */
703 BUG_ON(worker->current_work);
affee4b2 704 BUG_ON(!list_empty(&worker->scheduled));
c34056a3 705
c8e55f36
TH
706 if (worker->flags & WORKER_STARTED)
707 gcwq->nr_workers--;
708 if (worker->flags & WORKER_IDLE)
709 gcwq->nr_idle--;
710
711 list_del_init(&worker->entry);
712 worker->flags |= WORKER_DIE;
713
714 spin_unlock_irq(&gcwq->lock);
715
c34056a3
TH
716 kthread_stop(worker->task);
717 kfree(worker);
718
8b03ae3c
TH
719 spin_lock_irq(&gcwq->lock);
720 ida_remove(&gcwq->worker_ida, id);
c34056a3
TH
721}
722
affee4b2
TH
723/**
724 * move_linked_works - move linked works to a list
725 * @work: start of series of works to be scheduled
726 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
728 *
729 * Schedule linked works starting from @work to @head. Work series to
730 * be scheduled starts at @work and includes any consecutive work with
731 * WORK_STRUCT_LINKED set in its predecessor.
732 *
733 * If @nextp is not NULL, it's updated to point to the next work of
734 * the last scheduled work. This allows move_linked_works() to be
735 * nested inside outer list_for_each_entry_safe().
736 *
737 * CONTEXT:
8b03ae3c 738 * spin_lock_irq(gcwq->lock).
affee4b2
TH
739 */
740static void move_linked_works(struct work_struct *work, struct list_head *head,
741 struct work_struct **nextp)
742{
743 struct work_struct *n;
744
745 /*
746 * Linked worklist will always end before the end of the list,
747 * use NULL for list head.
748 */
749 list_for_each_entry_safe_from(work, n, NULL, entry) {
750 list_move_tail(&work->entry, head);
751 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
752 break;
753 }
754
755 /*
756 * If we're already inside safe list traversal and have moved
757 * multiple works to the scheduled queue, the next position
758 * needs to be updated.
759 */
760 if (nextp)
761 *nextp = n;
762}
763
1e19ffc6
TH
764static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
765{
766 struct work_struct *work = list_first_entry(&cwq->delayed_works,
767 struct work_struct, entry);
768
769 move_linked_works(work, &cwq->worklist, NULL);
770 cwq->nr_active++;
771}
772
73f53c4a
TH
773/**
774 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
775 * @cwq: cwq of interest
776 * @color: color of work which left the queue
777 *
778 * A work either has completed or is removed from pending queue,
779 * decrement nr_in_flight of its cwq and handle workqueue flushing.
780 *
781 * CONTEXT:
8b03ae3c 782 * spin_lock_irq(gcwq->lock).
73f53c4a
TH
783 */
784static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
785{
786 /* ignore uncolored works */
787 if (color == WORK_NO_COLOR)
788 return;
789
790 cwq->nr_in_flight[color]--;
1e19ffc6
TH
791 cwq->nr_active--;
792
793 /* one down, submit a delayed one */
794 if (!list_empty(&cwq->delayed_works) &&
795 cwq->nr_active < cwq->max_active)
796 cwq_activate_first_delayed(cwq);
73f53c4a
TH
797
798 /* is flush in progress and are we at the flushing tip? */
799 if (likely(cwq->flush_color != color))
800 return;
801
802 /* are there still in-flight works? */
803 if (cwq->nr_in_flight[color])
804 return;
805
806 /* this cwq is done, clear flush_color */
807 cwq->flush_color = -1;
808
809 /*
810 * If this was the last cwq, wake up the first flusher. It
811 * will handle the rest.
812 */
813 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
814 complete(&cwq->wq->first_flusher->done);
815}
816
a62428c0
TH
817/**
818 * process_one_work - process single work
c34056a3 819 * @worker: self
a62428c0
TH
820 * @work: work to process
821 *
 * Process @work.  This function contains all the logic necessary to
823 * process a single work including synchronization against and
824 * interaction with other workers on the same cpu, queueing and
825 * flushing. As long as context requirement is met, any worker can
826 * call this function to process a work.
827 *
828 * CONTEXT:
8b03ae3c 829 * spin_lock_irq(gcwq->lock) which is released and regrabbed.
a62428c0 830 */
c34056a3 831static void process_one_work(struct worker *worker, struct work_struct *work)
a62428c0 832{
c34056a3 833 struct cpu_workqueue_struct *cwq = worker->cwq;
8b03ae3c 834 struct global_cwq *gcwq = cwq->gcwq;
c8e55f36 835 struct hlist_head *bwh = busy_worker_head(gcwq, work);
a62428c0 836 work_func_t f = work->func;
73f53c4a 837 int work_color;
a62428c0
TH
838#ifdef CONFIG_LOCKDEP
839 /*
840 * It is permissible to free the struct work_struct from
841 * inside the function that is called from it, this we need to
842 * take into account for lockdep too. To avoid bogus "held
843 * lock freed" warnings as well as problems when looking into
844 * work->lockdep_map, make a copy and use that here.
845 */
846 struct lockdep_map lockdep_map = work->lockdep_map;
847#endif
848 /* claim and process */
a62428c0 849 debug_work_deactivate(work);
c8e55f36 850 hlist_add_head(&worker->hentry, bwh);
c34056a3 851 worker->current_work = work;
73f53c4a 852 work_color = get_work_color(work);
a62428c0
TH
853 list_del_init(&work->entry);
854
8b03ae3c 855 spin_unlock_irq(&gcwq->lock);
a62428c0
TH
856
857 BUG_ON(get_wq_data(work) != cwq);
858 work_clear_pending(work);
859 lock_map_acquire(&cwq->wq->lockdep_map);
860 lock_map_acquire(&lockdep_map);
861 f(work);
862 lock_map_release(&lockdep_map);
863 lock_map_release(&cwq->wq->lockdep_map);
864
865 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
866 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
867 "%s/0x%08x/%d\n",
868 current->comm, preempt_count(), task_pid_nr(current));
869 printk(KERN_ERR " last function: ");
870 print_symbol("%s\n", (unsigned long)f);
871 debug_show_held_locks(current);
872 dump_stack();
873 }
874
8b03ae3c 875 spin_lock_irq(&gcwq->lock);
a62428c0
TH
876
877 /* we're done with it, release */
c8e55f36 878 hlist_del_init(&worker->hentry);
c34056a3 879 worker->current_work = NULL;
73f53c4a 880 cwq_dec_nr_in_flight(cwq, work_color);
a62428c0
TH
881}
882
affee4b2
TH
883/**
884 * process_scheduled_works - process scheduled works
885 * @worker: self
886 *
887 * Process all scheduled works. Please note that the scheduled list
888 * may change while processing a work, so this function repeatedly
889 * fetches a work from the top and executes it.
890 *
891 * CONTEXT:
8b03ae3c 892 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
affee4b2
TH
893 * multiple times.
894 */
895static void process_scheduled_works(struct worker *worker)
1da177e4 896{
affee4b2
TH
897 while (!list_empty(&worker->scheduled)) {
898 struct work_struct *work = list_first_entry(&worker->scheduled,
1da177e4 899 struct work_struct, entry);
c34056a3 900 process_one_work(worker, work);
1da177e4 901 }
1da177e4
LT
902}
903
4690c4ab
TH
904/**
905 * worker_thread - the worker thread function
c34056a3 906 * @__worker: self
4690c4ab
TH
907 *
908 * The cwq worker thread function.
909 */
c34056a3 910static int worker_thread(void *__worker)
1da177e4 911{
c34056a3 912 struct worker *worker = __worker;
8b03ae3c 913 struct global_cwq *gcwq = worker->gcwq;
c34056a3 914 struct cpu_workqueue_struct *cwq = worker->cwq;
1da177e4 915
c8e55f36 916woke_up:
c8e55f36 917 spin_lock_irq(&gcwq->lock);
1da177e4 918
c8e55f36
TH
919 /* DIE can be set only while we're idle, checking here is enough */
920 if (worker->flags & WORKER_DIE) {
921 spin_unlock_irq(&gcwq->lock);
922 return 0;
923 }
affee4b2 924
c8e55f36 925 worker_leave_idle(worker);
db7bccf4 926recheck:
c8e55f36
TH
927 /*
928 * ->scheduled list can only be filled while a worker is
929 * preparing to process a work or actually processing it.
930 * Make sure nobody diddled with it while I was sleeping.
931 */
932 BUG_ON(!list_empty(&worker->scheduled));
933
934 while (!list_empty(&cwq->worklist)) {
935 struct work_struct *work =
936 list_first_entry(&cwq->worklist,
937 struct work_struct, entry);
938
db7bccf4
TH
939 /*
940 * The following is a rather inefficient way to close
941 * race window against cpu hotplug operations. Will
942 * be replaced soon.
943 */
944 if (unlikely(!(worker->flags & WORKER_ROGUE) &&
945 !cpumask_equal(&worker->task->cpus_allowed,
946 get_cpu_mask(gcwq->cpu)))) {
947 spin_unlock_irq(&gcwq->lock);
948 set_cpus_allowed_ptr(worker->task,
949 get_cpu_mask(gcwq->cpu));
950 cpu_relax();
951 spin_lock_irq(&gcwq->lock);
952 goto recheck;
953 }
954
c8e55f36
TH
955 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
956 /* optimization path, not strictly necessary */
957 process_one_work(worker, work);
958 if (unlikely(!list_empty(&worker->scheduled)))
affee4b2 959 process_scheduled_works(worker);
c8e55f36
TH
960 } else {
961 move_linked_works(work, &worker->scheduled, NULL);
962 process_scheduled_works(worker);
affee4b2 963 }
1da177e4 964 }
3af24433 965
c8e55f36
TH
966 /*
967 * gcwq->lock is held and there's no work to process, sleep.
968 * Workers are woken up only while holding gcwq->lock, so
969 * setting the current state before releasing gcwq->lock is
970 * enough to prevent losing any event.
971 */
972 worker_enter_idle(worker);
973 __set_current_state(TASK_INTERRUPTIBLE);
974 spin_unlock_irq(&gcwq->lock);
975 schedule();
976 goto woke_up;
1da177e4
LT
977}
978
fc2e4d70
ON
979struct wq_barrier {
980 struct work_struct work;
981 struct completion done;
982};
983
984static void wq_barrier_func(struct work_struct *work)
985{
986 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
987 complete(&barr->done);
988}
989
4690c4ab
TH
990/**
991 * insert_wq_barrier - insert a barrier work
992 * @cwq: cwq to insert barrier into
993 * @barr: wq_barrier to insert
affee4b2
TH
994 * @target: target work to attach @barr to
995 * @worker: worker currently executing @target, NULL if @target is not executing
4690c4ab 996 *
affee4b2
TH
997 * @barr is linked to @target such that @barr is completed only after
998 * @target finishes execution. Please note that the ordering
999 * guarantee is observed only with respect to @target and on the local
1000 * cpu.
1001 *
1002 * Currently, a queued barrier can't be canceled. This is because
1003 * try_to_grab_pending() can't determine whether the work to be
1004 * grabbed is at the head of the queue and thus can't clear LINKED
1005 * flag of the previous work while there must be a valid next work
1006 * after a work with LINKED flag set.
1007 *
1008 * Note that when @worker is non-NULL, @target may be modified
1009 * underneath us, so we can't reliably determine cwq from @target.
4690c4ab
TH
1010 *
1011 * CONTEXT:
8b03ae3c 1012 * spin_lock_irq(gcwq->lock).
4690c4ab 1013 */
83c22520 1014static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
affee4b2
TH
1015 struct wq_barrier *barr,
1016 struct work_struct *target, struct worker *worker)
fc2e4d70 1017{
affee4b2
TH
1018 struct list_head *head;
1019 unsigned int linked = 0;
1020
dc186ad7 1021 /*
8b03ae3c 1022 * debugobject calls are safe here even with gcwq->lock locked
dc186ad7
TG
1023 * as we know for sure that this will not trigger any of the
1024 * checks and call back into the fixup functions where we
1025 * might deadlock.
1026 */
1027 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
22df02bb 1028 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
fc2e4d70 1029 init_completion(&barr->done);
83c22520 1030
affee4b2
TH
1031 /*
1032 * If @target is currently being executed, schedule the
1033 * barrier to the worker; otherwise, put it after @target.
1034 */
1035 if (worker)
1036 head = worker->scheduled.next;
1037 else {
1038 unsigned long *bits = work_data_bits(target);
1039
1040 head = target->entry.next;
1041 /* there can already be other linked works, inherit and set */
1042 linked = *bits & WORK_STRUCT_LINKED;
1043 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
1044 }
1045
dc186ad7 1046 debug_work_activate(&barr->work);
affee4b2
TH
1047 insert_work(cwq, &barr->work, head,
1048 work_color_to_flags(WORK_NO_COLOR) | linked);
fc2e4d70
ON
1049}
1050
73f53c4a
TH
1051/**
1052 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
1053 * @wq: workqueue being flushed
1054 * @flush_color: new flush color, < 0 for no-op
1055 * @work_color: new work color, < 0 for no-op
1056 *
1057 * Prepare cwqs for workqueue flushing.
1058 *
1059 * If @flush_color is non-negative, flush_color on all cwqs should be
1060 * -1. If no cwq has in-flight commands at the specified color, all
1061 * cwq->flush_color's stay at -1 and %false is returned. If any cwq
1062 * has in flight commands, its cwq->flush_color is set to
1063 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
1064 * wakeup logic is armed and %true is returned.
1065 *
1066 * The caller should have initialized @wq->first_flusher prior to
1067 * calling this function with non-negative @flush_color. If
1068 * @flush_color is negative, no flush color update is done and %false
1069 * is returned.
1070 *
1071 * If @work_color is non-negative, all cwqs should have the same
1072 * work_color which is previous to @work_color and all will be
1073 * advanced to @work_color.
1074 *
1075 * CONTEXT:
1076 * mutex_lock(wq->flush_mutex).
1077 *
1078 * RETURNS:
1079 * %true if @flush_color >= 0 and there's something to flush. %false
1080 * otherwise.
1081 */
1082static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
1083 int flush_color, int work_color)
1da177e4 1084{
73f53c4a
TH
1085 bool wait = false;
1086 unsigned int cpu;
1da177e4 1087
73f53c4a
TH
1088 if (flush_color >= 0) {
1089 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
1090 atomic_set(&wq->nr_cwqs_to_flush, 1);
1da177e4 1091 }
2355b70f 1092
73f53c4a
TH
1093 for_each_possible_cpu(cpu) {
1094 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
8b03ae3c 1095 struct global_cwq *gcwq = cwq->gcwq;
73f53c4a 1096
8b03ae3c 1097 spin_lock_irq(&gcwq->lock);
73f53c4a
TH
1098
1099 if (flush_color >= 0) {
1100 BUG_ON(cwq->flush_color != -1);
1101
1102 if (cwq->nr_in_flight[flush_color]) {
1103 cwq->flush_color = flush_color;
1104 atomic_inc(&wq->nr_cwqs_to_flush);
1105 wait = true;
1106 }
1107 }
1108
1109 if (work_color >= 0) {
1110 BUG_ON(work_color != work_next_color(cwq->work_color));
1111 cwq->work_color = work_color;
1112 }
1113
8b03ae3c 1114 spin_unlock_irq(&gcwq->lock);
dc186ad7 1115 }
14441960 1116
73f53c4a
TH
1117 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
1118 complete(&wq->first_flusher->done);
1119
1120 return wait;
1da177e4
LT
1121}
1122
0fcb78c2 1123/**
1da177e4 1124 * flush_workqueue - ensure that any scheduled work has run to completion.
0fcb78c2 1125 * @wq: workqueue to flush
1da177e4
LT
1126 *
1127 * Forces execution of the workqueue and blocks until its completion.
1128 * This is typically used in driver shutdown handlers.
1129 *
fc2e4d70
ON
1130 * We sleep until all works which were queued on entry have been handled,
1131 * but we are not livelocked by new incoming ones.
1da177e4 1132 */
7ad5b3a5 1133void flush_workqueue(struct workqueue_struct *wq)
1da177e4 1134{
73f53c4a
TH
1135 struct wq_flusher this_flusher = {
1136 .list = LIST_HEAD_INIT(this_flusher.list),
1137 .flush_color = -1,
1138 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
1139 };
1140 int next_color;
1da177e4 1141
3295f0ef
IM
1142 lock_map_acquire(&wq->lockdep_map);
1143 lock_map_release(&wq->lockdep_map);
73f53c4a
TH
1144
1145 mutex_lock(&wq->flush_mutex);
1146
1147 /*
1148 * Start-to-wait phase
1149 */
1150 next_color = work_next_color(wq->work_color);
1151
1152 if (next_color != wq->flush_color) {
1153 /*
1154 * Color space is not full. The current work_color
1155 * becomes our flush_color and work_color is advanced
1156 * by one.
1157 */
1158 BUG_ON(!list_empty(&wq->flusher_overflow));
1159 this_flusher.flush_color = wq->work_color;
1160 wq->work_color = next_color;
1161
1162 if (!wq->first_flusher) {
1163 /* no flush in progress, become the first flusher */
1164 BUG_ON(wq->flush_color != this_flusher.flush_color);
1165
1166 wq->first_flusher = &this_flusher;
1167
1168 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
1169 wq->work_color)) {
1170 /* nothing to flush, done */
1171 wq->flush_color = next_color;
1172 wq->first_flusher = NULL;
1173 goto out_unlock;
1174 }
1175 } else {
1176 /* wait in queue */
1177 BUG_ON(wq->flush_color == this_flusher.flush_color);
1178 list_add_tail(&this_flusher.list, &wq->flusher_queue);
1179 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
1180 }
1181 } else {
1182 /*
1183 * Oops, color space is full, wait on overflow queue.
1184 * The next flush completion will assign us
1185 * flush_color and transfer to flusher_queue.
1186 */
1187 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
1188 }
1189
1190 mutex_unlock(&wq->flush_mutex);
1191
1192 wait_for_completion(&this_flusher.done);
1193
1194 /*
1195 * Wake-up-and-cascade phase
1196 *
1197 * First flushers are responsible for cascading flushes and
1198 * handling overflow. Non-first flushers can simply return.
1199 */
1200 if (wq->first_flusher != &this_flusher)
1201 return;
1202
1203 mutex_lock(&wq->flush_mutex);
1204
1205 wq->first_flusher = NULL;
1206
1207 BUG_ON(!list_empty(&this_flusher.list));
1208 BUG_ON(wq->flush_color != this_flusher.flush_color);
1209
1210 while (true) {
1211 struct wq_flusher *next, *tmp;
1212
1213 /* complete all the flushers sharing the current flush color */
1214 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
1215 if (next->flush_color != wq->flush_color)
1216 break;
1217 list_del_init(&next->list);
1218 complete(&next->done);
1219 }
1220
1221 BUG_ON(!list_empty(&wq->flusher_overflow) &&
1222 wq->flush_color != work_next_color(wq->work_color));
1223
1224 /* this flush_color is finished, advance by one */
1225 wq->flush_color = work_next_color(wq->flush_color);
1226
1227 /* one color has been freed, handle overflow queue */
1228 if (!list_empty(&wq->flusher_overflow)) {
1229 /*
1230 * Assign the same color to all overflowed
1231 * flushers, advance work_color and append to
1232 * flusher_queue. This is the start-to-wait
1233 * phase for these overflowed flushers.
1234 */
1235 list_for_each_entry(tmp, &wq->flusher_overflow, list)
1236 tmp->flush_color = wq->work_color;
1237
1238 wq->work_color = work_next_color(wq->work_color);
1239
1240 list_splice_tail_init(&wq->flusher_overflow,
1241 &wq->flusher_queue);
1242 flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
1243 }
1244
1245 if (list_empty(&wq->flusher_queue)) {
1246 BUG_ON(wq->flush_color != wq->work_color);
1247 break;
1248 }
1249
1250 /*
1251 * Need to flush more colors. Make the next flusher
1252 * the new first flusher and arm cwqs.
1253 */
1254 BUG_ON(wq->flush_color == wq->work_color);
1255 BUG_ON(wq->flush_color != next->flush_color);
1256
1257 list_del_init(&next->list);
1258 wq->first_flusher = next;
1259
1260 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
1261 break;
1262
1263 /*
1264 * Meh... this color is already done, clear first
1265 * flusher and repeat cascading.
1266 */
1267 wq->first_flusher = NULL;
1268 }
1269
1270out_unlock:
1271 mutex_unlock(&wq->flush_mutex);
1da177e4 1272}
ae90dd5d 1273EXPORT_SYMBOL_GPL(flush_workqueue);
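/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): a driver typically stops queueing new work and then flushes its
 * workqueue on shutdown; "my_wq" is a hypothetical workqueue created by the
 * driver:
 *
 *	... make sure nothing queues new work on my_wq ...
 *	flush_workqueue(my_wq);
 *	destroy_workqueue(my_wq);
 */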
1da177e4 1274
db700897
ON
1275/**
1276 * flush_work - block until a work_struct's callback has terminated
1277 * @work: the work which is to be flushed
1278 *
a67da70d
ON
1279 * Returns false if @work has already terminated.
1280 *
db700897
ON
1281 * It is expected that, prior to calling flush_work(), the caller has
1282 * arranged for the work to not be requeued, otherwise it doesn't make
1283 * sense to use this function.
1284 */
1285int flush_work(struct work_struct *work)
1286{
affee4b2 1287 struct worker *worker = NULL;
db700897 1288 struct cpu_workqueue_struct *cwq;
8b03ae3c 1289 struct global_cwq *gcwq;
db700897
ON
1290 struct wq_barrier barr;
1291
1292 might_sleep();
1293 cwq = get_wq_data(work);
1294 if (!cwq)
1295 return 0;
8b03ae3c 1296 gcwq = cwq->gcwq;
db700897 1297
3295f0ef
IM
1298 lock_map_acquire(&cwq->wq->lockdep_map);
1299 lock_map_release(&cwq->wq->lockdep_map);
a67da70d 1300
8b03ae3c 1301 spin_lock_irq(&gcwq->lock);
db700897
ON
1302 if (!list_empty(&work->entry)) {
1303 /*
1304 * See the comment near try_to_grab_pending()->smp_rmb().
1305 * If it was re-queued under us we are not going to wait.
1306 */
1307 smp_rmb();
1308 if (unlikely(cwq != get_wq_data(work)))
4690c4ab 1309 goto already_gone;
db700897 1310 } else {
affee4b2
TH
1311 if (cwq->worker && cwq->worker->current_work == work)
1312 worker = cwq->worker;
1313 if (!worker)
4690c4ab 1314 goto already_gone;
db700897 1315 }
db700897 1316
affee4b2 1317 insert_wq_barrier(cwq, &barr, work, worker);
8b03ae3c 1318 spin_unlock_irq(&gcwq->lock);
db700897 1319 wait_for_completion(&barr.done);
dc186ad7 1320 destroy_work_on_stack(&barr.work);
db700897 1321 return 1;
4690c4ab 1322already_gone:
8b03ae3c 1323 spin_unlock_irq(&gcwq->lock);
4690c4ab 1324 return 0;
db700897
ON
1325}
1326EXPORT_SYMBOL_GPL(flush_work);
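/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): the caller first prevents re-queueing, then waits for the single
 * item; "dev" and its members are hypothetical:
 *
 *	dev->shutting_down = true;	... stops the requeue path ...
 *	flush_work(&dev->reset_work);
 */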
1327
6e84d644 1328/*
1f1f642e 1329 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
6e84d644
ON
1330 * so this work can't be re-armed in any way.
1331 */
1332static int try_to_grab_pending(struct work_struct *work)
1333{
8b03ae3c 1334 struct global_cwq *gcwq;
6e84d644 1335 struct cpu_workqueue_struct *cwq;
1f1f642e 1336 int ret = -1;
6e84d644 1337
22df02bb 1338 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1f1f642e 1339 return 0;
6e84d644
ON
1340
1341 /*
1342 * The queueing is in progress, or it is already queued. Try to
1343 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1344 */
1345
1346 cwq = get_wq_data(work);
1347 if (!cwq)
1348 return ret;
8b03ae3c 1349 gcwq = cwq->gcwq;
6e84d644 1350
8b03ae3c 1351 spin_lock_irq(&gcwq->lock);
6e84d644
ON
1352 if (!list_empty(&work->entry)) {
1353 /*
1354 * This work is queued, but perhaps we locked the wrong cwq.
1355 * In that case we must see the new value after rmb(), see
1356 * insert_work()->wmb().
1357 */
1358 smp_rmb();
1359 if (cwq == get_wq_data(work)) {
dc186ad7 1360 debug_work_deactivate(work);
6e84d644 1361 list_del_init(&work->entry);
73f53c4a 1362 cwq_dec_nr_in_flight(cwq, get_work_color(work));
6e84d644
ON
1363 ret = 1;
1364 }
1365 }
8b03ae3c 1366 spin_unlock_irq(&gcwq->lock);
6e84d644
ON
1367
1368 return ret;
1369}
1370
1371static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
b89deed3
ON
1372 struct work_struct *work)
1373{
8b03ae3c 1374 struct global_cwq *gcwq = cwq->gcwq;
b89deed3 1375 struct wq_barrier barr;
affee4b2 1376 struct worker *worker;
b89deed3 1377
8b03ae3c 1378 spin_lock_irq(&gcwq->lock);
affee4b2
TH
1379
1380 worker = NULL;
c34056a3 1381 if (unlikely(cwq->worker && cwq->worker->current_work == work)) {
affee4b2
TH
1382 worker = cwq->worker;
1383 insert_wq_barrier(cwq, &barr, work, worker);
b89deed3 1384 }
affee4b2 1385
8b03ae3c 1386 spin_unlock_irq(&gcwq->lock);
b89deed3 1387
affee4b2 1388 if (unlikely(worker)) {
b89deed3 1389 wait_for_completion(&barr.done);
dc186ad7
TG
1390 destroy_work_on_stack(&barr.work);
1391 }
b89deed3
ON
1392}
1393
6e84d644 1394static void wait_on_work(struct work_struct *work)
b89deed3
ON
1395{
1396 struct cpu_workqueue_struct *cwq;
28e53bdd 1397 struct workqueue_struct *wq;
b1f4ec17 1398 int cpu;
b89deed3 1399
f293ea92
ON
1400 might_sleep();
1401
3295f0ef
IM
1402 lock_map_acquire(&work->lockdep_map);
1403 lock_map_release(&work->lockdep_map);
4e6045f1 1404
b89deed3 1405 cwq = get_wq_data(work);
b89deed3 1406 if (!cwq)
3af24433 1407 return;
b89deed3 1408
28e53bdd 1409 wq = cwq->wq;
28e53bdd 1410
1537663f 1411 for_each_possible_cpu(cpu)
4690c4ab 1412 wait_on_cpu_work(get_cwq(cpu, wq), work);
6e84d644
ON
1413}
1414
1f1f642e
ON
1415static int __cancel_work_timer(struct work_struct *work,
1416 struct timer_list* timer)
1417{
1418 int ret;
1419
1420 do {
1421 ret = (timer && likely(del_timer(timer)));
1422 if (!ret)
1423 ret = try_to_grab_pending(work);
1424 wait_on_work(work);
1425 } while (unlikely(ret < 0));
1426
4d707b9f 1427 clear_wq_data(work);
1f1f642e
ON
1428 return ret;
1429}
1430
6e84d644
ON
1431/**
1432 * cancel_work_sync - block until a work_struct's callback has terminated
1433 * @work: the work which is to be flushed
1434 *
1f1f642e
ON
1435 * Returns true if @work was pending.
1436 *
6e84d644
ON
1437 * cancel_work_sync() will cancel the work if it is queued. If the work's
1438 * callback appears to be running, cancel_work_sync() will block until it
1439 * has completed.
1440 *
1441 * It is possible to use this function if the work re-queues itself. It can
1442 * cancel the work even if it migrates to another workqueue, however in that
1443 * case it only guarantees that work->func() has completed on the last queued
1444 * workqueue.
1445 *
1446 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
1447 * pending, otherwise it goes into a busy-wait loop until the timer expires.
1448 *
1449 * The caller must ensure that workqueue_struct on which this work was last
1450 * queued can't be destroyed before this function returns.
1451 */
1f1f642e 1452int cancel_work_sync(struct work_struct *work)
6e84d644 1453{
1f1f642e 1454 return __cancel_work_timer(work, NULL);
b89deed3 1455}
28e53bdd 1456EXPORT_SYMBOL_GPL(cancel_work_sync);
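/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): cancel_work_sync() is the usual teardown call when the work might
 * be pending or running; "dev" is hypothetical:
 *
 *	cancel_work_sync(&dev->reset_work);
 *	... after this the work function is neither queued nor running ...
 */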
b89deed3 1457
6e84d644 1458/**
f5a421a4 1459 * cancel_delayed_work_sync - reliably kill off a delayed work.
6e84d644
ON
1460 * @dwork: the delayed work struct
1461 *
1f1f642e
ON
1462 * Returns true if @dwork was pending.
1463 *
6e84d644
ON
1464 * It is possible to use this function if @dwork rearms itself via queue_work()
1465 * or queue_delayed_work(). See also the comment for cancel_work_sync().
1466 */
1f1f642e 1467int cancel_delayed_work_sync(struct delayed_work *dwork)
6e84d644 1468{
1f1f642e 1469 return __cancel_work_timer(&dwork->work, &dwork->timer);
6e84d644 1470}
f5a421a4 1471EXPORT_SYMBOL(cancel_delayed_work_sync);
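/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): a self-rearming poller and its teardown; all names are
 * hypothetical:
 *
 *	static void my_dev_poll_fn(struct work_struct *work)
 *	{
 *		struct my_dev *dev = container_of(work, struct my_dev,
 *						  poll_work.work);
 *		... poll the hardware ...
 *		schedule_delayed_work(&dev->poll_work, HZ);
 *	}
 *
 *	... on teardown ...
 *	cancel_delayed_work_sync(&dev->poll_work);
 */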
1da177e4 1472
6e84d644 1473static struct workqueue_struct *keventd_wq __read_mostly;
1da177e4 1474
0fcb78c2
REB
1475/**
1476 * schedule_work - put work task in global workqueue
1477 * @work: job to be done
1478 *
5b0f437d
BVA
1479 * Returns zero if @work was already on the kernel-global workqueue and
1480 * non-zero otherwise.
1481 *
1482 * This puts a job in the kernel-global workqueue if it was not already
1483 * queued and leaves it in the same position on the kernel-global
1484 * workqueue otherwise.
0fcb78c2 1485 */
7ad5b3a5 1486int schedule_work(struct work_struct *work)
1da177e4
LT
1487{
1488 return queue_work(keventd_wq, work);
1489}
ae90dd5d 1490EXPORT_SYMBOL(schedule_work);
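/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): schedule_work() is the common way to defer processing from atomic
 * context (e.g. an interrupt handler) to the kernel-global workqueue; names
 * are hypothetical:
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct my_dev *dev = data;
 *		... ack the interrupt ...
 *		schedule_work(&dev->event_work);
 *		return IRQ_HANDLED;
 *	}
 */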
1da177e4 1491
c1a220e7
ZR
1492/*
1493 * schedule_work_on - put work task on a specific cpu
1494 * @cpu: cpu to put the work task on
1495 * @work: job to be done
1496 *
1497 * This puts a job on a specific cpu
1498 */
1499int schedule_work_on(int cpu, struct work_struct *work)
1500{
1501 return queue_work_on(cpu, keventd_wq, work);
1502}
1503EXPORT_SYMBOL(schedule_work_on);
1504
0fcb78c2
REB
1505/**
1506 * schedule_delayed_work - put work task in global workqueue after delay
52bad64d
DH
1507 * @dwork: job to be done
1508 * @delay: number of jiffies to wait or 0 for immediate execution
0fcb78c2
REB
1509 *
1510 * After waiting for a given time this puts a job in the kernel-global
1511 * workqueue.
1512 */
7ad5b3a5 1513int schedule_delayed_work(struct delayed_work *dwork,
82f67cd9 1514 unsigned long delay)
1da177e4 1515{
52bad64d 1516 return queue_delayed_work(keventd_wq, dwork, delay);
1da177e4 1517}
ae90dd5d 1518EXPORT_SYMBOL(schedule_delayed_work);
1da177e4 1519
8c53e463
LT
1520/**
1521 * flush_delayed_work - block until a dwork_struct's callback has terminated
1522 * @dwork: the delayed work which is to be flushed
1523 *
1524 * Any timeout is cancelled, and any pending work is run immediately.
1525 */
1526void flush_delayed_work(struct delayed_work *dwork)
1527{
1528 if (del_timer_sync(&dwork->timer)) {
4690c4ab
TH
1529 __queue_work(get_cpu(), get_wq_data(&dwork->work)->wq,
1530 &dwork->work);
8c53e463
LT
1531 put_cpu();
1532 }
1533 flush_work(&dwork->work);
1534}
1535EXPORT_SYMBOL(flush_delayed_work);
1536
0fcb78c2
REB
1537/**
1538 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
1539 * @cpu: cpu to use
52bad64d 1540 * @dwork: job to be done
0fcb78c2
REB
1541 * @delay: number of jiffies to wait
1542 *
1543 * After waiting for a given time this puts a job in the kernel-global
1544 * workqueue on the specified CPU.
1545 */
1da177e4 1546int schedule_delayed_work_on(int cpu,
52bad64d 1547 struct delayed_work *dwork, unsigned long delay)
1da177e4 1548{
52bad64d 1549 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
1da177e4 1550}
ae90dd5d 1551EXPORT_SYMBOL(schedule_delayed_work_on);
1da177e4 1552
b6136773
AM
1553/**
1554 * schedule_on_each_cpu - call a function on each online CPU from keventd
1555 * @func: the function to call
b6136773
AM
1556 *
1557 * Returns zero on success.
1558 * Returns -ve errno on failure.
1559 *
b6136773
AM
1560 * schedule_on_each_cpu() is very slow.
1561 */
65f27f38 1562int schedule_on_each_cpu(work_func_t func)
15316ba8
CL
1563{
1564 int cpu;
65a64464 1565 int orig = -1;
b6136773 1566 struct work_struct *works;
15316ba8 1567
b6136773
AM
1568 works = alloc_percpu(struct work_struct);
1569 if (!works)
15316ba8 1570 return -ENOMEM;
b6136773 1571
93981800
TH
1572 get_online_cpus();
1573
65a64464 1574 /*
93981800
TH
1575 * When running in keventd don't schedule a work item on
1576 * itself. Can just call directly because the work queue is
1577 * already bound. This also is faster.
65a64464 1578 */
93981800 1579 if (current_is_keventd())
65a64464 1580 orig = raw_smp_processor_id();
65a64464 1581
15316ba8 1582 for_each_online_cpu(cpu) {
9bfb1839
IM
1583 struct work_struct *work = per_cpu_ptr(works, cpu);
1584
1585 INIT_WORK(work, func);
65a64464 1586 if (cpu != orig)
93981800 1587 schedule_work_on(cpu, work);
65a64464 1588 }
93981800
TH
1589 if (orig >= 0)
1590 func(per_cpu_ptr(works, orig));
1591
1592 for_each_online_cpu(cpu)
1593 flush_work(per_cpu_ptr(works, cpu));
1594
95402b38 1595 put_online_cpus();
b6136773 1596 free_percpu(works);
15316ba8
CL
1597 return 0;
1598}
1599
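/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): running a function on every online CPU from process context;
 * drain_my_percpu_cache() is hypothetical:
 *
 *	static void drain_my_percpu_cache(struct work_struct *unused)
 *	{
 *		... operates on this cpu's data ...
 *	}
 *
 *	int ret = schedule_on_each_cpu(drain_my_percpu_cache);
 */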
eef6a7d5
AS
1600/**
1601 * flush_scheduled_work - ensure that any scheduled work has run to completion.
1602 *
1603 * Forces execution of the kernel-global workqueue and blocks until its
1604 * completion.
1605 *
1606 * Think twice before calling this function! It's very easy to get into
1607 * trouble if you don't take great care. Either of the following situations
1608 * will lead to deadlock:
1609 *
1610 * One of the work items currently on the workqueue needs to acquire
1611 * a lock held by your code or its caller.
1612 *
1613 * Your code is running in the context of a work routine.
1614 *
1615 * They will be detected by lockdep when they occur, but the first might not
1616 * occur very often. It depends on what work items are on the workqueue and
1617 * what locks they need, which you have no control over.
1618 *
1619 * In most situations flushing the entire workqueue is overkill; you merely
1620 * need to know that a particular work item isn't queued and isn't running.
1621 * In such cases you should use cancel_delayed_work_sync() or
1622 * cancel_work_sync() instead.
1623 */
1da177e4
LT
1624void flush_scheduled_work(void)
1625{
1626 flush_workqueue(keventd_wq);
1627}
ae90dd5d 1628EXPORT_SYMBOL(flush_scheduled_work);
1da177e4 1629
1fa44eca
JB
1630/**
1631 * execute_in_process_context - reliably execute the routine with user context
1632 * @fn: the function to execute
1fa44eca
JB
1633 * @ew: guaranteed storage for the execute work structure (must
1634 * be available when the work executes)
1635 *
1636 * Executes the function immediately if process context is available,
1637 * otherwise schedules the function for delayed execution.
1638 *
1639 * Returns: 0 - function was executed
1640 * 1 - function was scheduled for execution
1641 */
65f27f38 1642int execute_in_process_context(work_func_t fn, struct execute_work *ew)
1fa44eca
JB
1643{
1644 if (!in_interrupt()) {
65f27f38 1645 fn(&ew->work);
1fa44eca
JB
1646 return 0;
1647 }
1648
65f27f38 1649 INIT_WORK(&ew->work, fn);
1fa44eca
JB
1650 schedule_work(&ew->work);
1651
1652 return 1;
1653}
1654EXPORT_SYMBOL_GPL(execute_in_process_context);
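/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): the caller provides storage for the execute_work so the deferred
 * path has somewhere to live; "dev" and my_release_fn() are hypothetical:
 *
 *	execute_in_process_context(my_release_fn, &dev->release_ew);
 *
 * If the caller is already in process context my_release_fn() runs
 * immediately, otherwise it is handed to schedule_work().
 */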
1655
1da177e4
LT
1656int keventd_up(void)
1657{
1658 return keventd_wq != NULL;
1659}
1660
1661int current_is_keventd(void)
1662{
1663 struct cpu_workqueue_struct *cwq;
d243769d 1664 int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
1da177e4
LT
1665 int ret = 0;
1666
1667 BUG_ON(!keventd_wq);
1668
1537663f 1669 cwq = get_cwq(cpu, keventd_wq);
c34056a3 1670 if (current == cwq->worker->task)
1da177e4
LT
1671 ret = 1;
1672
1673 return ret;
1674
1675}
1676
0f900049
TH
1677static struct cpu_workqueue_struct *alloc_cwqs(void)
1678{
1679 /*
1680 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
1681 * Make sure that the alignment isn't lower than that of
1682 * unsigned long long.
1683 */
1684 const size_t size = sizeof(struct cpu_workqueue_struct);
1685 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
1686 __alignof__(unsigned long long));
1687 struct cpu_workqueue_struct *cwqs;
1688#ifndef CONFIG_SMP
1689 void *ptr;
1690
1691 /*
1692 * On UP, percpu allocator doesn't honor alignment parameter
1693 * and simply uses arch-dependent default. Allocate enough
1694 * room to align cwq and put an extra pointer at the end
1695 * pointing back to the originally allocated pointer which
1696 * will be used for free.
1697 *
1698 * FIXME: This really belongs to UP percpu code. Update UP
1699 * percpu code to honor alignment and remove this ugliness.
1700 */
1701 ptr = __alloc_percpu(size + align + sizeof(void *), 1);
1702 cwqs = PTR_ALIGN(ptr, align);
1703 *(void **)per_cpu_ptr(cwqs + 1, 0) = ptr;
1704#else
1705 /* On SMP, percpu allocator can do it itself */
1706 cwqs = __alloc_percpu(size, align);
1707#endif
1708 /* just in case, make sure it's actually aligned */
1709 BUG_ON(!IS_ALIGNED((unsigned long)cwqs, align));
1710 return cwqs;
1711}
1712
1713static void free_cwqs(struct cpu_workqueue_struct *cwqs)
1714{
1715#ifndef CONFIG_SMP
1716 /* on UP, the pointer to free is stored right after the cwq */
1717 if (cwqs)
1718 free_percpu(*(void **)per_cpu_ptr(cwqs + 1, 0));
1719#else
1720 free_percpu(cwqs);
1721#endif
1722}
1723
4e6045f1 1724struct workqueue_struct *__create_workqueue_key(const char *name,
97e37d7b 1725 unsigned int flags,
1e19ffc6 1726 int max_active,
eb13ba87
JB
1727 struct lock_class_key *key,
1728 const char *lock_name)
1da177e4 1729{
1537663f 1730 bool singlethread = flags & WQ_SINGLE_THREAD;
1da177e4 1731 struct workqueue_struct *wq;
c34056a3
TH
1732 bool failed = false;
1733 unsigned int cpu;
1da177e4 1734
1e19ffc6
TH
1735 max_active = clamp_val(max_active, 1, INT_MAX);
1736
3af24433
ON
1737 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
1738 if (!wq)
4690c4ab 1739 goto err;
3af24433 1740
0f900049 1741 wq->cpu_wq = alloc_cwqs();
4690c4ab
TH
1742 if (!wq->cpu_wq)
1743 goto err;
3af24433 1744
97e37d7b 1745 wq->flags = flags;
a0a1a5fd 1746 wq->saved_max_active = max_active;
73f53c4a
TH
1747 mutex_init(&wq->flush_mutex);
1748 atomic_set(&wq->nr_cwqs_to_flush, 0);
1749 INIT_LIST_HEAD(&wq->flusher_queue);
1750 INIT_LIST_HEAD(&wq->flusher_overflow);
3af24433 1751 wq->name = name;
eb13ba87 1752 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
cce1a165 1753 INIT_LIST_HEAD(&wq->list);
3af24433 1754
1537663f
TH
1755 cpu_maps_update_begin();
1756 /*
1757 * We must initialize cwqs for each possible cpu even if we
1758 * end up calling destroy_workqueue(). Otherwise
1759 * cpu_up() can hit the uninitialized cwq once we drop the
1760 * lock.
1761 */
1762 for_each_possible_cpu(cpu) {
1763 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
8b03ae3c 1764 struct global_cwq *gcwq = get_gcwq(cpu);
1537663f 1765
0f900049 1766 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
8b03ae3c 1767 cwq->gcwq = gcwq;
c34056a3 1768 cwq->wq = wq;
73f53c4a 1769 cwq->flush_color = -1;
1e19ffc6 1770 cwq->max_active = max_active;
1537663f 1771 INIT_LIST_HEAD(&cwq->worklist);
1e19ffc6 1772 INIT_LIST_HEAD(&cwq->delayed_works);
1537663f 1773
c34056a3 1774 if (failed)
1537663f 1775 continue;
c34056a3
TH
1776 cwq->worker = create_worker(cwq,
1777 cpu_online(cpu) && !singlethread);
1778 if (cwq->worker)
1779 start_worker(cwq->worker);
1537663f 1780 else
c34056a3 1781 failed = true;
3af24433
ON
1782 }
1783
a0a1a5fd
TH
1784 /*
1785 * workqueue_lock protects global freeze state and workqueues
1786 * list. Grab it, set max_active accordingly and add the new
1787 * workqueue to workqueues list.
1788 */
1537663f 1789 spin_lock(&workqueue_lock);
a0a1a5fd
TH
1790
1791 if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
1792 for_each_possible_cpu(cpu)
1793 get_cwq(cpu, wq)->max_active = 0;
1794
1537663f 1795 list_add(&wq->list, &workqueues);
a0a1a5fd 1796
1537663f
TH
1797 spin_unlock(&workqueue_lock);
1798
1799 cpu_maps_update_done();
1800
c34056a3 1801 if (failed) {
3af24433
ON
1802 destroy_workqueue(wq);
1803 wq = NULL;
1804 }
1805 return wq;
4690c4ab
TH
1806err:
1807 if (wq) {
0f900049 1808 free_cwqs(wq->cpu_wq);
4690c4ab
TH
1809 kfree(wq);
1810 }
1811 return NULL;
3af24433 1812}
4e6045f1 1813EXPORT_SYMBOL_GPL(__create_workqueue_key);
1da177e4 1814
3af24433
ON
1815/**
1816 * destroy_workqueue - safely terminate a workqueue
1817 * @wq: target workqueue
1818 *
1819 * Safely destroy a workqueue. All work currently pending will be done first.
1820 */
1821void destroy_workqueue(struct workqueue_struct *wq)
1822{
c8e55f36 1823 unsigned int cpu;
3af24433 1824
a0a1a5fd
TH
1825 flush_workqueue(wq);
1826
1827 /*
1828 * wq list is used to freeze wq, remove from list after
1829 * flushing is complete in case freeze races us.
1830 */
3da1c84c 1831 cpu_maps_update_begin();
95402b38 1832 spin_lock(&workqueue_lock);
b1f4ec17 1833 list_del(&wq->list);
95402b38 1834 spin_unlock(&workqueue_lock);
1537663f 1835 cpu_maps_update_done();
3af24433 1836
73f53c4a
TH
1837 for_each_possible_cpu(cpu) {
1838 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
1839 int i;
1840
c34056a3 1841 if (cwq->worker) {
c8e55f36 1842 spin_lock_irq(&cwq->gcwq->lock);
c34056a3
TH
1843 destroy_worker(cwq->worker);
1844 cwq->worker = NULL;
c8e55f36 1845 spin_unlock_irq(&cwq->gcwq->lock);
73f53c4a
TH
1846 }
1847
1848 for (i = 0; i < WORK_NR_COLORS; i++)
1849 BUG_ON(cwq->nr_in_flight[i]);
1e19ffc6
TH
1850 BUG_ON(cwq->nr_active);
1851 BUG_ON(!list_empty(&cwq->delayed_works));
73f53c4a 1852 }
9b41ea72 1853
0f900049 1854 free_cwqs(wq->cpu_wq);
3af24433
ON
1855 kfree(wq);
1856}
1857EXPORT_SYMBOL_GPL(destroy_workqueue);
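/*
 * Illustrative sketch only (not part of the original file): typical
 * lifetime of a workqueue created through the create_workqueue()
 * wrapper around __create_workqueue_key().  All example_* names are
 * hypothetical.
 */
static struct workqueue_struct *example_wq;

static void example_work_fn(struct work_struct *work)
{
	printk(KERN_INFO "example work ran\n");
}

static DECLARE_WORK(example_work, example_work_fn);

static int example_init(void)
{
	example_wq = create_workqueue("example");
	if (!example_wq)
		return -ENOMEM;
	queue_work(example_wq, &example_work);
	return 0;
}

static void example_exit(void)
{
	/* flushes pending work, tears down workers and frees the cwqs */
	destroy_workqueue(example_wq);
}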
1858
db7bccf4
TH
1859/*
1860 * CPU hotplug.
1861 *
1862 * CPU hotplug is implemented by allowing cwqs to be detached from
1863 * the CPU, running with unbound workers and allowing them to be
1864 * reattached later if the cpu comes back online. A separate thread
1865 * is created to govern cwqs in such a state and is called the trustee.
1866 *
1867 * Trustee states and their descriptions.
1868 *
1869 * START Command state used on startup. On CPU_DOWN_PREPARE, a
1870 * new trustee is started with this state.
1871 *
1872 * IN_CHARGE Once started, trustee will enter this state after
1873 * making all existing workers rogue. DOWN_PREPARE waits
1874 * for trustee to enter this state. After reaching
1875 * IN_CHARGE, trustee tries to drain the pending
1876 * worklist until it's empty and the state is set to
1877 * BUTCHER, or the state is set to RELEASE.
1878 *
1879 * BUTCHER Command state which is set by the cpu callback after
1880 * the cpu has gone down. Once this state is set, trustee
1881 * knows that there will be no new works on the worklist
1882 * and once the worklist is empty it can proceed to
1883 * killing idle workers.
1884 *
1885 * RELEASE Command state which is set by the cpu callback if the
1886 * cpu down has been canceled or it has come online
1887 * again. After recognizing this state, trustee stops
1888 * trying to drain or butcher and transits to DONE.
1889 *
1890 * DONE Trustee will enter this state after BUTCHER or RELEASE
1891 * is complete.
1892 *
1893 * trustee CPU draining
1894 * took over down complete
1895 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
1896 * | | ^
1897 * | CPU is back online v return workers |
1898 * ----------------> RELEASE --------------
1899 */
1900
1901/**
1902 * trustee_wait_event_timeout - timed event wait for trustee
1903 * @cond: condition to wait for
1904 * @timeout: timeout in jiffies
1905 *
1906 * wait_event_timeout() for trustee to use. Handles locking and
1907 * checks for RELEASE request.
1908 *
1909 * CONTEXT:
1910 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1911 * multiple times. To be used by trustee.
1912 *
1913 * RETURNS:
1914 * Positive remaining time (in jiffies) if @cond is satisfied, 0 if timed
1915 * out, -1 if canceled.
1916 */
1917#define trustee_wait_event_timeout(cond, timeout) ({ \
1918 long __ret = (timeout); \
1919 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
1920 __ret) { \
1921 spin_unlock_irq(&gcwq->lock); \
1922 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
1923 (gcwq->trustee_state == TRUSTEE_RELEASE), \
1924 __ret); \
1925 spin_lock_irq(&gcwq->lock); \
1926 } \
1927 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
1928})
1929
1930/**
1931 * trustee_wait_event - event wait for trustee
1932 * @cond: condition to wait for
1933 *
1934 * wait_event() for trustee to use. Automatically handles locking and
1935 * checks for RELEASE request.
1936 *
1937 * CONTEXT:
1938 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
1939 * multiple times. To be used by trustee.
1940 *
1941 * RETURNS:
1942 * 0 if @cond is satisfied, -1 if canceled.
1943 */
1944#define trustee_wait_event(cond) ({ \
1945 long __ret1; \
1946 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
1947 __ret1 < 0 ? -1 : 0; \
1948})
1949
1950static int __cpuinit trustee_thread(void *__gcwq)
1951{
1952 struct global_cwq *gcwq = __gcwq;
1953 struct worker *worker;
1954 struct hlist_node *pos;
1955 int i;
1956
1957 BUG_ON(gcwq->cpu != smp_processor_id());
1958
1959 spin_lock_irq(&gcwq->lock);
1960 /*
1961 * Make all multithread workers rogue. Trustee must be bound
1962 * to the target cpu and can't be cancelled.
1963 */
1964 BUG_ON(gcwq->cpu != smp_processor_id());
1965
1966 list_for_each_entry(worker, &gcwq->idle_list, entry)
1967 if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
1968 worker->flags |= WORKER_ROGUE;
1969
1970 for_each_busy_worker(worker, i, pos, gcwq)
1971 if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
1972 worker->flags |= WORKER_ROGUE;
1973
1974 /*
1975 * We're now in charge. Notify and proceed to drain. We need
1976 * to keep the gcwq running during the whole CPU down
1977 * procedure as other cpu hotunplug callbacks may need to
1978 * flush currently running tasks.
1979 */
1980 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
1981 wake_up_all(&gcwq->trustee_wait);
1982
1983 /*
1984 * The original cpu is in the process of dying and may go away
1985 * anytime now. When that happens, we and all workers would
1986 * be migrated to other cpus. Try draining any remaining work.
1987 * Note that if the gcwq is frozen, there may be frozen works
1988 * in freezeable cwqs. Don't declare completion while frozen.
1989 */
1990 while (gcwq->nr_workers != gcwq->nr_idle ||
1991 gcwq->flags & GCWQ_FREEZING ||
1992 gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
1993 /* give a breather */
1994 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
1995 break;
1996 }
1997
1998 /* notify completion */
1999 gcwq->trustee = NULL;
2000 gcwq->trustee_state = TRUSTEE_DONE;
2001 wake_up_all(&gcwq->trustee_wait);
2002 spin_unlock_irq(&gcwq->lock);
2003 return 0;
2004}
2005
2006/**
2007 * wait_trustee_state - wait for trustee to enter the specified state
2008 * @gcwq: gcwq the trustee of interest belongs to
2009 * @state: target state to wait for
2010 *
2011 * Wait for the trustee to reach @state. DONE is already matched.
2012 *
2013 * CONTEXT:
2014 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
2015 * multiple times. To be used by cpu_callback.
2016 */
2017static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
2018{
2019 if (!(gcwq->trustee_state == state ||
2020 gcwq->trustee_state == TRUSTEE_DONE)) {
2021 spin_unlock_irq(&gcwq->lock);
2022 __wait_event(gcwq->trustee_wait,
2023 gcwq->trustee_state == state ||
2024 gcwq->trustee_state == TRUSTEE_DONE);
2025 spin_lock_irq(&gcwq->lock);
2026 }
2027}
2028
3af24433
ON
2029static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
2030 unsigned long action,
2031 void *hcpu)
2032{
2033 unsigned int cpu = (unsigned long)hcpu;
db7bccf4
TH
2034 struct global_cwq *gcwq = get_gcwq(cpu);
2035 struct task_struct *new_trustee = NULL;
2036 struct worker *worker;
2037 struct hlist_node *pos;
2038 unsigned long flags;
2039 int i;
3af24433 2040
8bb78442
RW
2041 action &= ~CPU_TASKS_FROZEN;
2042
db7bccf4
TH
2043 switch (action) {
2044 case CPU_DOWN_PREPARE:
2045 new_trustee = kthread_create(trustee_thread, gcwq,
2046 "workqueue_trustee/%d\n", cpu);
2047 if (IS_ERR(new_trustee))
2048 return notifier_from_errno(PTR_ERR(new_trustee));
2049 kthread_bind(new_trustee, cpu);
2050 }
3af24433 2051
db7bccf4
TH
2052 /* some are called w/ irq disabled, don't disturb irq status */
2053 spin_lock_irqsave(&gcwq->lock, flags);
3af24433 2054
db7bccf4
TH
2055 switch (action) {
2056 case CPU_DOWN_PREPARE:
2057 /* initialize trustee and tell it to acquire the gcwq */
2058 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
2059 gcwq->trustee = new_trustee;
2060 gcwq->trustee_state = TRUSTEE_START;
2061 wake_up_process(gcwq->trustee);
2062 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
2063 break;
2064
2065 case CPU_POST_DEAD:
2066 gcwq->trustee_state = TRUSTEE_BUTCHER;
2067 break;
2068
2069 case CPU_DOWN_FAILED:
2070 case CPU_ONLINE:
2071 if (gcwq->trustee_state != TRUSTEE_DONE) {
2072 gcwq->trustee_state = TRUSTEE_RELEASE;
2073 wake_up_process(gcwq->trustee);
2074 wait_trustee_state(gcwq, TRUSTEE_DONE);
3af24433 2075 }
db7bccf4
TH
2076
2077 /* clear ROGUE from all multithread workers */
2078 list_for_each_entry(worker, &gcwq->idle_list, entry)
2079 if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
2080 worker->flags &= ~WORKER_ROGUE;
2081
2082 for_each_busy_worker(worker, i, pos, gcwq)
2083 if (!(worker->cwq->wq->flags & WQ_SINGLE_THREAD))
2084 worker->flags &= ~WORKER_ROGUE;
2085 break;
1da177e4
LT
2086 }
2087
db7bccf4
TH
2088 spin_unlock_irqrestore(&gcwq->lock, flags);
2089
1537663f 2090 return notifier_from_errno(0);
1da177e4 2091}
1da177e4 2092
2d3854a3 2093#ifdef CONFIG_SMP
8ccad40d 2094
2d3854a3 2095struct work_for_cpu {
6b44003e 2096 struct completion completion;
2d3854a3
RR
2097 long (*fn)(void *);
2098 void *arg;
2099 long ret;
2100};
2101
6b44003e 2102static int do_work_for_cpu(void *_wfc)
2d3854a3 2103{
6b44003e 2104 struct work_for_cpu *wfc = _wfc;
2d3854a3 2105 wfc->ret = wfc->fn(wfc->arg);
6b44003e
AM
2106 complete(&wfc->completion);
2107 return 0;
2d3854a3
RR
2108}
2109
2110/**
2111 * work_on_cpu - run a function in user context on a particular cpu
2112 * @cpu: the cpu to run on
2113 * @fn: the function to run
2114 * @arg: the function arg
2115 *
31ad9081
RR
2116 * This will return the value @fn returns.
2117 * It is up to the caller to ensure that the cpu doesn't go offline.
6b44003e 2118 * The caller must not hold any locks which would prevent @fn from completing.
2d3854a3
RR
2119 */
2120long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
2121{
6b44003e
AM
2122 struct task_struct *sub_thread;
2123 struct work_for_cpu wfc = {
2124 .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
2125 .fn = fn,
2126 .arg = arg,
2127 };
2128
2129 sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
2130 if (IS_ERR(sub_thread))
2131 return PTR_ERR(sub_thread);
2132 kthread_bind(sub_thread, cpu);
2133 wake_up_process(sub_thread);
2134 wait_for_completion(&wfc.completion);
2d3854a3
RR
2135 return wfc.ret;
2136}
2137EXPORT_SYMBOL_GPL(work_on_cpu);
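/*
 * Illustrative sketch only (not part of the original file): a
 * hypothetical caller of work_on_cpu().  The callback runs in the
 * bound kthread created above, so it may sleep; the caller is
 * responsible for keeping @cpu online (e.g. via get_online_cpus()).
 */
static long example_query(void *arg)
{
	/* executes on the requested cpu */
	return raw_smp_processor_id();
}

static long example_query_cpu(unsigned int cpu)
{
	return work_on_cpu(cpu, example_query, NULL);
}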
2138#endif /* CONFIG_SMP */
2139
a0a1a5fd
TH
2140#ifdef CONFIG_FREEZER
2141
2142/**
2143 * freeze_workqueues_begin - begin freezing workqueues
2144 *
2145 * Start freezing workqueues. After this function returns, all
2146 * freezeable workqueues will queue new works to their delayed_works
2147 * list instead of the cwq worklist.
2148 *
2149 * CONTEXT:
8b03ae3c 2150 * Grabs and releases workqueue_lock and gcwq->lock's.
a0a1a5fd
TH
2151 */
2152void freeze_workqueues_begin(void)
2153{
2154 struct workqueue_struct *wq;
2155 unsigned int cpu;
2156
2157 spin_lock(&workqueue_lock);
2158
2159 BUG_ON(workqueue_freezing);
2160 workqueue_freezing = true;
2161
2162 for_each_possible_cpu(cpu) {
8b03ae3c
TH
2163 struct global_cwq *gcwq = get_gcwq(cpu);
2164
2165 spin_lock_irq(&gcwq->lock);
2166
db7bccf4
TH
2167 BUG_ON(gcwq->flags & GCWQ_FREEZING);
2168 gcwq->flags |= GCWQ_FREEZING;
2169
a0a1a5fd
TH
2170 list_for_each_entry(wq, &workqueues, list) {
2171 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2172
a0a1a5fd
TH
2173 if (wq->flags & WQ_FREEZEABLE)
2174 cwq->max_active = 0;
a0a1a5fd 2175 }
8b03ae3c
TH
2176
2177 spin_unlock_irq(&gcwq->lock);
a0a1a5fd
TH
2178 }
2179
2180 spin_unlock(&workqueue_lock);
2181}
2182
2183/**
2184 * freeze_workqueues_busy - are freezeable workqueues still busy?
2185 *
2186 * Check whether freezing is complete. This function must be called
2187 * between freeze_workqueues_begin() and thaw_workqueues().
2188 *
2189 * CONTEXT:
2190 * Grabs and releases workqueue_lock.
2191 *
2192 * RETURNS:
2193 * %true if some freezeable workqueues are still busy. %false if
2194 * freezing is complete.
2195 */
2196bool freeze_workqueues_busy(void)
2197{
2198 struct workqueue_struct *wq;
2199 unsigned int cpu;
2200 bool busy = false;
2201
2202 spin_lock(&workqueue_lock);
2203
2204 BUG_ON(!workqueue_freezing);
2205
2206 for_each_possible_cpu(cpu) {
2207 /*
2208 * nr_active is monotonically decreasing. It's safe
2209 * to peek without lock.
2210 */
2211 list_for_each_entry(wq, &workqueues, list) {
2212 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2213
2214 if (!(wq->flags & WQ_FREEZEABLE))
2215 continue;
2216
2217 BUG_ON(cwq->nr_active < 0);
2218 if (cwq->nr_active) {
2219 busy = true;
2220 goto out_unlock;
2221 }
2222 }
2223 }
2224out_unlock:
2225 spin_unlock(&workqueue_lock);
2226 return busy;
2227}
2228
2229/**
2230 * thaw_workqueues - thaw workqueues
2231 *
2232 * Thaw workqueues. Normal queueing is restored and all works
2233 * frozen on the delayed lists are transferred to their cwq worklists.
2234 *
2235 * CONTEXT:
8b03ae3c 2236 * Grabs and releases workqueue_lock and gcwq->lock's.
a0a1a5fd
TH
2237 */
2238void thaw_workqueues(void)
2239{
2240 struct workqueue_struct *wq;
2241 unsigned int cpu;
2242
2243 spin_lock(&workqueue_lock);
2244
2245 if (!workqueue_freezing)
2246 goto out_unlock;
2247
2248 for_each_possible_cpu(cpu) {
8b03ae3c
TH
2249 struct global_cwq *gcwq = get_gcwq(cpu);
2250
2251 spin_lock_irq(&gcwq->lock);
2252
db7bccf4
TH
2253 BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
2254 gcwq->flags &= ~GCWQ_FREEZING;
2255
a0a1a5fd
TH
2256 list_for_each_entry(wq, &workqueues, list) {
2257 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2258
2259 if (!(wq->flags & WQ_FREEZEABLE))
2260 continue;
2261
a0a1a5fd
TH
2262 /* restore max_active and repopulate worklist */
2263 cwq->max_active = wq->saved_max_active;
2264
2265 while (!list_empty(&cwq->delayed_works) &&
2266 cwq->nr_active < cwq->max_active)
2267 cwq_activate_first_delayed(cwq);
2268
c8e55f36 2269 wake_up_process(cwq->worker->task);
a0a1a5fd 2270 }
8b03ae3c
TH
2271
2272 spin_unlock_irq(&gcwq->lock);
a0a1a5fd
TH
2273 }
2274
2275 workqueue_freezing = false;
2276out_unlock:
2277 spin_unlock(&workqueue_lock);
2278}
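/*
 * Illustrative sketch only (not part of the original file): the order
 * in which the freezer core is expected to drive the three functions
 * above.  example_freeze_cycle() is hypothetical and assumes
 * <linux/delay.h> for msleep().
 */
static void example_freeze_cycle(void)
{
	freeze_workqueues_begin();

	/* wait for in-flight works on freezeable workqueues to finish */
	while (freeze_workqueues_busy())
		msleep(10);

	/* ... hibernation image would be created here ... */

	thaw_workqueues();
}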
2279#endif /* CONFIG_FREEZER */
2280
c12920d1 2281void __init init_workqueues(void)
1da177e4 2282{
c34056a3 2283 unsigned int cpu;
c8e55f36 2284 int i;
c34056a3 2285
e7577c50 2286 singlethread_cpu = cpumask_first(cpu_possible_mask);
db7bccf4 2287 hotcpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
8b03ae3c
TH
2288
2289 /* initialize gcwqs */
2290 for_each_possible_cpu(cpu) {
2291 struct global_cwq *gcwq = get_gcwq(cpu);
2292
2293 spin_lock_init(&gcwq->lock);
2294 gcwq->cpu = cpu;
2295
c8e55f36
TH
2296 INIT_LIST_HEAD(&gcwq->idle_list);
2297 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
2298 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
2299
8b03ae3c 2300 ida_init(&gcwq->worker_ida);
db7bccf4
TH
2301
2302 gcwq->trustee_state = TRUSTEE_DONE;
2303 init_waitqueue_head(&gcwq->trustee_wait);
8b03ae3c
TH
2304 }
2305
1da177e4
LT
2306 keventd_wq = create_workqueue("events");
2307 BUG_ON(!keventd_wq);
2308}