workqueue: update synchronization rules on workqueue->pwqs
[linux-2.6-block.git] / kernel / workqueue.c
1da177e4 1/*
c54fce6e 2 * kernel/workqueue.c - generic async execution with shared worker pool
1da177e4 3 *
c54fce6e 4 * Copyright (C) 2002 Ingo Molnar
1da177e4 5 *
c54fce6e 6 * Derived from the taskqueue/keventd code by:
7 * David Woodhouse <dwmw2@infradead.org>
8 * Andrew Morton
9 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
10 * Theodore Ts'o <tytso@mit.edu>
1da177e4 11 *
c54fce6e 12 * Made to use alloc_percpu by Christoph Lameter.
1da177e4 13 *
c54fce6e 14 * Copyright (C) 2010 SUSE Linux Products GmbH
15 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
89ada679 16 *
c54fce6e 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and
20 * one extra for works which are better served by workers which are
21 * not bound to any specific CPU.
22 *
23 * Please read Documentation/workqueue.txt for details.
1da177e4 24 */
25
9984de1a 26#include <linux/export.h>
1da177e4
LT
27#include <linux/kernel.h>
28#include <linux/sched.h>
29#include <linux/init.h>
30#include <linux/signal.h>
31#include <linux/completion.h>
32#include <linux/workqueue.h>
33#include <linux/slab.h>
34#include <linux/cpu.h>
35#include <linux/notifier.h>
36#include <linux/kthread.h>
1fa44eca 37#include <linux/hardirq.h>
46934023 38#include <linux/mempolicy.h>
341a5958 39#include <linux/freezer.h>
d5abe669
PZ
40#include <linux/kallsyms.h>
41#include <linux/debug_locks.h>
4e6045f1 42#include <linux/lockdep.h>
c34056a3 43#include <linux/idr.h>
42f8570f 44#include <linux/hashtable.h>
76af4d93 45#include <linux/rculist.h>
e22bee78 46
ea138446 47#include "workqueue_internal.h"
1da177e4 48
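/*
 * Illustrative sketch, not part of the original file: a minimal user of
 * the mechanism described in the header comment above.  The function and
 * work item names are hypothetical; DECLARE_WORK(), queue_work() and
 * flush_work() are the real APIs from <linux/workqueue.h>.
 */
static void example_async_fn(struct work_struct *work)
{
	/* runs in process context on one of the shared pool workers */
	pr_info("example work item executed\n");
}

static DECLARE_WORK(example_async_work, example_async_fn);

static void __maybe_unused example_async_submit(void)
{
	queue_work(system_wq, &example_async_work);	/* or schedule_work() */
	flush_work(&example_async_work);		/* wait for it to finish */
}
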
c8e55f36 49enum {
24647570 50 /*
51 * worker_pool flags
bc2ae0f5 52 *
24647570 53 * A bound pool is either associated or disassociated with its CPU.
bc2ae0f5 54 * While associated (!DISASSOCIATED), all workers are bound to the
55 * CPU and none has %WORKER_UNBOUND set and concurrency management
56 * is in effect.
57 *
58 * While DISASSOCIATED, the cpu may be offline and all workers have
59 * %WORKER_UNBOUND set and concurrency management disabled, and may
24647570 60 * be executing on any CPU. The pool behaves as an unbound one.
bc2ae0f5 61 *
62 * Note that DISASSOCIATED can be flipped only while holding
24647570 63 * assoc_mutex to avoid changing binding state while
64 * create_worker() is in progress.
bc2ae0f5 65 */
11ebea50 66 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
552a37e9 67 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
24647570 68 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
35b6bb63 69 POOL_FREEZING = 1 << 3, /* freeze in progress */
db7bccf4 70
c8e55f36 71 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */
73 WORKER_DIE = 1 << 1, /* die die die */
74 WORKER_IDLE = 1 << 2, /* is idle */
e22bee78 75 WORKER_PREP = 1 << 3, /* preparing to run works */
fb0e7beb 76 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
f3421797 77 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
e22bee78 78
5f7dabfd 79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
403c821d 80 WORKER_CPU_INTENSIVE,
db7bccf4 81
e34cdddb 82 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
4ce62e9e 83
c8e55f36 84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
db7bccf4 85
e22bee78 86 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
87 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
88
3233cdbd 89 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
90 /* call for help after 10ms
91 (min two ticks) */
e22bee78 92 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
93 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
e22bee78 94
95 /*
96 * Rescue workers are used only in emergencies and shared by
97 * all cpus. Give -20.
98 */
99 RESCUER_NICE_LEVEL = -20,
3270476a 100 HIGHPRI_NICE_LEVEL = -20,
c8e55f36 101};
1da177e4 102
103/*
4690c4ab 104 * Structure fields follow one of the following exclusion rules.
105 *
e41e704b 106 * I: Modifiable by initialization/destruction paths and read-only for
107 * everyone else.
4690c4ab 108 *
e22bee78 109 * P: Preemption protected. Disabling preemption is enough and should
110 * only be modified and accessed from the local cpu.
111 *
d565ed63 112 * L: pool->lock protected. Access with pool->lock held.
4690c4ab 113 *
d565ed63 114 * X: During normal operation, modification requires pool->lock and should
115 * be done only from local cpu. Either disabling preemption on local
116 * cpu or grabbing pool->lock is enough for read access. If
117 * POOL_DISASSOCIATED is set, it's identical to L.
e22bee78 118 *
73f53c4a 119 * F: wq->flush_mutex protected.
120 *
4690c4ab 121 * W: workqueue_lock protected.
76af4d93 122 *
123 * R: workqueue_lock protected for writes. Sched-RCU protected for reads.
1da177e4 124 */
1da177e4 125
2eaebdb3 126/* struct worker is defined in workqueue_internal.h */
c34056a3 127
bd7bdd43 128struct worker_pool {
d565ed63 129 spinlock_t lock; /* the pool lock */
d84ff051 130 int cpu; /* I: the associated cpu */
9daf9e67 131 int id; /* I: pool ID */
11ebea50 132 unsigned int flags; /* X: flags */
bd7bdd43 133
134 struct list_head worklist; /* L: list of pending works */
135 int nr_workers; /* L: total number of workers */
ea1abd61 136
137 /* nr_idle includes the ones off idle_list for rebinding */
bd7bdd43 138 int nr_idle; /* L: currently idle ones */
139
140 struct list_head idle_list; /* X: list of idle workers */
141 struct timer_list idle_timer; /* L: worker idle timeout */
142 struct timer_list mayday_timer; /* L: SOS timer for workers */
143
c9e7cf27 144 /* workers are chained either in busy_hash or idle_list */
145 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
146 /* L: hash of busy workers */
147
24647570 148 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
bd7bdd43 149 struct ida worker_ida; /* L: for worker IDs */
e19e397a 150
151 /*
152 * The current concurrency level. As it's likely to be accessed
153 * from other CPUs during try_to_wake_up(), put it in a separate
154 * cacheline.
155 */
156 atomic_t nr_running ____cacheline_aligned_in_smp;
8b03ae3c 157} ____cacheline_aligned_in_smp;
158
1da177e4 159/*
112202d9 160 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
161 * of work_struct->data are used for flags and the remaining high bits
162 * point to the pwq; thus, pwqs need to be aligned on a
163 * (1 << WORK_STRUCT_FLAG_BITS) byte boundary.
1da177e4 164 */
112202d9 165struct pool_workqueue {
bd7bdd43 166 struct worker_pool *pool; /* I: the associated pool */
4690c4ab 167 struct workqueue_struct *wq; /* I: the owning workqueue */
73f53c4a 168 int work_color; /* L: current color */
169 int flush_color; /* L: flushing color */
170 int nr_in_flight[WORK_NR_COLORS];
171 /* L: nr of in_flight works */
1e19ffc6 172 int nr_active; /* L: nr of active works */
a0a1a5fd 173 int max_active; /* L: max active works */
1e19ffc6 174 struct list_head delayed_works; /* L: delayed works */
76af4d93 175 struct list_head pwqs_node; /* R: node on wq->pwqs */
493a1724 176 struct list_head mayday_node; /* W: node on wq->maydays */
e904e6c2 177} __aligned(1 << WORK_STRUCT_FLAG_BITS);
1da177e4 178
73f53c4a 179/*
180 * Structure used to wait for workqueue flush.
181 */
182struct wq_flusher {
183 struct list_head list; /* F: list of flushers */
184 int flush_color; /* F: flush color waiting for */
185 struct completion done; /* flush completion */
186};
187
1da177e4 188/*
189 * The externally visible workqueue abstraction is an array of
190 * per-CPU workqueues:
191 */
192struct workqueue_struct {
9c5a2ba7 193 unsigned int flags; /* W: WQ_* flags */
420c0ddb 194 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */
76af4d93 195 struct list_head pwqs; /* R: all pwqs of this wq */
4690c4ab 196 struct list_head list; /* W: list of all workqueues */
73f53c4a 197
198 struct mutex flush_mutex; /* protects wq flushing */
199 int work_color; /* F: current work color */
200 int flush_color; /* F: current flush color */
112202d9 201 atomic_t nr_pwqs_to_flush; /* flush in progress */
73f53c4a 202 struct wq_flusher *first_flusher; /* F: first flusher */
203 struct list_head flusher_queue; /* F: flush waiters */
204 struct list_head flusher_overflow; /* F: flush overflow list */
205
493a1724 206 struct list_head maydays; /* W: pwqs requesting rescue */
e22bee78 207 struct worker *rescuer; /* I: rescue worker */
208
9c5a2ba7 209 int nr_drainers; /* W: drain in progress */
112202d9 210 int saved_max_active; /* W: saved pwq max_active */
4e6045f1 211#ifdef CONFIG_LOCKDEP
4690c4ab 212 struct lockdep_map lockdep_map;
4e6045f1 213#endif
b196be89 214 char name[]; /* I: workqueue name */
1da177e4 215};
216
e904e6c2 217static struct kmem_cache *pwq_cache;
218
d320c038 219struct workqueue_struct *system_wq __read_mostly;
d320c038 220EXPORT_SYMBOL_GPL(system_wq);
044c782c 221struct workqueue_struct *system_highpri_wq __read_mostly;
1aabe902 222EXPORT_SYMBOL_GPL(system_highpri_wq);
044c782c 223struct workqueue_struct *system_long_wq __read_mostly;
d320c038 224EXPORT_SYMBOL_GPL(system_long_wq);
044c782c 225struct workqueue_struct *system_unbound_wq __read_mostly;
f3421797 226EXPORT_SYMBOL_GPL(system_unbound_wq);
044c782c 227struct workqueue_struct *system_freezable_wq __read_mostly;
24d51add 228EXPORT_SYMBOL_GPL(system_freezable_wq);
d320c038 229
97bd2347 230#define CREATE_TRACE_POINTS
231#include <trace/events/workqueue.h>
232
76af4d93 233#define assert_rcu_or_wq_lock() \
234 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
235 lockdep_is_held(&workqueue_lock), \
236 "sched RCU or workqueue lock should be held")
237
38db41d9 238#define for_each_std_worker_pool(pool, cpu) \
a60dc39c 239 for ((pool) = &std_worker_pools(cpu)[0]; \
240 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
4ce62e9e 241
b67bfe0d 242#define for_each_busy_worker(worker, i, pool) \
243 hash_for_each(pool->busy_hash, i, worker, hentry)
db7bccf4 244
706026c2 245static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
246 unsigned int sw)
f3421797 247{
248 if (cpu < nr_cpu_ids) {
249 if (sw & 1) {
250 cpu = cpumask_next(cpu, mask);
251 if (cpu < nr_cpu_ids)
252 return cpu;
253 }
254 if (sw & 2)
255 return WORK_CPU_UNBOUND;
256 }
6be19588 257 return WORK_CPU_END;
f3421797 258}
259
09884951 260/*
261 * CPU iterators
262 *
706026c2 263 * An extra cpu number is defined using an invalid cpu number
09884951 264 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
706026c2 265 * specific CPU. The following iterators are similar to for_each_*_cpu()
266 * iterators but also consider the unbound CPU.
09884951 267 *
706026c2 268 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
269 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
09884951 270 */
706026c2 271#define for_each_wq_cpu(cpu) \
272 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
6be19588 273 (cpu) < WORK_CPU_END; \
706026c2 274 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
f3421797 275
706026c2 276#define for_each_online_wq_cpu(cpu) \
277 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
6be19588 278 (cpu) < WORK_CPU_END; \
706026c2 279 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
f3421797 280
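/*
 * Illustrative sketch, not part of the original file: how the iterators
 * above are typically used.  The helper name is hypothetical; it simply
 * walks every workqueue CPU slot, including the WORK_CPU_UNBOUND one.
 */
static void __maybe_unused example_count_wq_cpus(void)
{
	int cpu, nr = 0;

	for_each_wq_cpu(cpu) {
		if (cpu == WORK_CPU_UNBOUND)
			pr_debug("unbound CPU slot\n");
		nr++;
	}
	pr_debug("%d workqueue CPU slots (possible CPUs + 1)\n", nr);
}
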
17116969 281/**
282 * for_each_pool - iterate through all worker_pools in the system
283 * @pool: iteration cursor
284 * @id: integer used for iteration
285 */
286#define for_each_pool(pool, id) \
287 idr_for_each_entry(&worker_pool_idr, pool, id)
288
49e3cf44 289/**
290 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
291 * @pwq: iteration cursor
292 * @wq: the target workqueue
76af4d93 293 *
294 * This must be called either with workqueue_lock held or sched RCU read
295 * locked. If the pwq needs to be used beyond the locking in effect, the
296 * caller is responsible for guaranteeing that the pwq stays online.
297 *
298 * The if/else clause exists only for the lockdep assertion and can be
299 * ignored.
49e3cf44 300 */
301#define for_each_pwq(pwq, wq) \
76af4d93 302 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
303 if (({ assert_rcu_or_wq_lock(); false; })) { } \
304 else
f3421797 305
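/*
 * Illustrative sketch, not part of the original file: reading wq->pwqs
 * under the rules documented for for_each_pwq() above.  Readers may hold
 * either sched RCU or workqueue_lock; writers must hold workqueue_lock.
 * The helper name is hypothetical.
 */
static int __maybe_unused example_count_pwqs(struct workqueue_struct *wq)
{
	struct pool_workqueue *pwq;
	int nr = 0;

	rcu_read_lock_sched();		/* satisfies assert_rcu_or_wq_lock() */
	for_each_pwq(pwq, wq)
		nr++;
	rcu_read_unlock_sched();

	return nr;
}
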
dc186ad7
TG
306#ifdef CONFIG_DEBUG_OBJECTS_WORK
307
308static struct debug_obj_descr work_debug_descr;
309
99777288
SG
310static void *work_debug_hint(void *addr)
311{
312 return ((struct work_struct *) addr)->func;
313}
314
dc186ad7
TG
315/*
316 * fixup_init is called when:
317 * - an active object is initialized
318 */
319static int work_fixup_init(void *addr, enum debug_obj_state state)
320{
321 struct work_struct *work = addr;
322
323 switch (state) {
324 case ODEBUG_STATE_ACTIVE:
325 cancel_work_sync(work);
326 debug_object_init(work, &work_debug_descr);
327 return 1;
328 default:
329 return 0;
330 }
331}
332
333/*
334 * fixup_activate is called when:
335 * - an active object is activated
336 * - an unknown object is activated (might be a statically initialized object)
337 */
338static int work_fixup_activate(void *addr, enum debug_obj_state state)
339{
340 struct work_struct *work = addr;
341
342 switch (state) {
343
344 case ODEBUG_STATE_NOTAVAILABLE:
345 /*
346 * This is not really a fixup. The work struct was
347 * statically initialized. We just make sure that it
348 * is tracked in the object tracker.
349 */
22df02bb 350 if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
dc186ad7
TG
351 debug_object_init(work, &work_debug_descr);
352 debug_object_activate(work, &work_debug_descr);
353 return 0;
354 }
355 WARN_ON_ONCE(1);
356 return 0;
357
358 case ODEBUG_STATE_ACTIVE:
359 WARN_ON(1);
360
361 default:
362 return 0;
363 }
364}
365
366/*
367 * fixup_free is called when:
368 * - an active object is freed
369 */
370static int work_fixup_free(void *addr, enum debug_obj_state state)
371{
372 struct work_struct *work = addr;
373
374 switch (state) {
375 case ODEBUG_STATE_ACTIVE:
376 cancel_work_sync(work);
377 debug_object_free(work, &work_debug_descr);
378 return 1;
379 default:
380 return 0;
381 }
382}
383
384static struct debug_obj_descr work_debug_descr = {
385 .name = "work_struct",
99777288 386 .debug_hint = work_debug_hint,
dc186ad7
TG
387 .fixup_init = work_fixup_init,
388 .fixup_activate = work_fixup_activate,
389 .fixup_free = work_fixup_free,
390};
391
392static inline void debug_work_activate(struct work_struct *work)
393{
394 debug_object_activate(work, &work_debug_descr);
395}
396
397static inline void debug_work_deactivate(struct work_struct *work)
398{
399 debug_object_deactivate(work, &work_debug_descr);
400}
401
402void __init_work(struct work_struct *work, int onstack)
403{
404 if (onstack)
405 debug_object_init_on_stack(work, &work_debug_descr);
406 else
407 debug_object_init(work, &work_debug_descr);
408}
409EXPORT_SYMBOL_GPL(__init_work);
410
411void destroy_work_on_stack(struct work_struct *work)
412{
413 debug_object_free(work, &work_debug_descr);
414}
415EXPORT_SYMBOL_GPL(destroy_work_on_stack);
416
417#else
418static inline void debug_work_activate(struct work_struct *work) { }
419static inline void debug_work_deactivate(struct work_struct *work) { }
420#endif
421
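/*
 * Illustrative sketch, not part of the original file: the on-stack work
 * pattern which the debugobjects hooks above track.  The helper name is
 * hypothetical; INIT_WORK_ONSTACK() and destroy_work_on_stack() are the
 * real APIs.
 */
static void __maybe_unused example_onstack_work(work_func_t fn)
{
	struct work_struct work;

	INIT_WORK_ONSTACK(&work, fn);	/* registers the debug object */
	schedule_work(&work);
	flush_work(&work);		/* must finish before the frame is gone */
	destroy_work_on_stack(&work);	/* unregisters the debug object */
}
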
95402b38
GS
422/* Serializes the accesses to the list of workqueues. */
423static DEFINE_SPINLOCK(workqueue_lock);
1da177e4 424static LIST_HEAD(workqueues);
a0a1a5fd 425static bool workqueue_freezing; /* W: have wqs started freezing? */
c34056a3 426
e22bee78 427/*
e19e397a
TH
428 * The CPU and unbound standard worker pools. The unbound ones have
429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
f3421797 430 */
e19e397a
TH
431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
432 cpu_std_worker_pools);
a60dc39c 433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
f3421797 434
9daf9e67
TH
435/* idr of all pools */
436static DEFINE_MUTEX(worker_pool_idr_mutex);
437static DEFINE_IDR(worker_pool_idr);
438
c34056a3 439static int worker_thread(void *__worker);
1da177e4 440
a60dc39c 441static struct worker_pool *std_worker_pools(int cpu)
8b03ae3c 442{
f3421797 443 if (cpu != WORK_CPU_UNBOUND)
a60dc39c 444 return per_cpu(cpu_std_worker_pools, cpu);
f3421797 445 else
a60dc39c 446 return unbound_std_worker_pools;
8b03ae3c
TH
447}
448
4e8f0a60
TH
449static int std_worker_pool_pri(struct worker_pool *pool)
450{
a60dc39c 451 return pool - std_worker_pools(pool->cpu);
4e8f0a60
TH
452}
453
9daf9e67
TH
454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
456{
457 int ret;
458
459 mutex_lock(&worker_pool_idr_mutex);
460 idr_pre_get(&worker_pool_idr, GFP_KERNEL);
461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
462 mutex_unlock(&worker_pool_idr_mutex);
463
464 return ret;
465}
466
7c3eed5c
TH
467/*
468 * Lookup worker_pool by id. The idr currently is built during boot and
469 * never modified. Don't worry about locking for now.
470 */
471static struct worker_pool *worker_pool_by_id(int pool_id)
472{
473 return idr_find(&worker_pool_idr, pool_id);
474}
475
d565ed63
TH
476static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
477{
a60dc39c 478 struct worker_pool *pools = std_worker_pools(cpu);
d565ed63 479
a60dc39c 480 return &pools[highpri];
d565ed63
TH
481}
482
76af4d93 483/**
484 * first_pwq - return the first pool_workqueue of the specified workqueue
485 * @wq: the target workqueue
486 *
487 * This must be called either with workqueue_lock held or sched RCU read
488 * locked. If the pwq needs to be used beyond the locking in effect, the
489 * caller is responsible for guaranteeing that the pwq stays online.
490 */
7fb98ea7 491static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
b1f4ec17 492{
76af4d93 493 assert_rcu_or_wq_lock();
494 return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
495 pwqs_node);
b1f4ec17 496}
497
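/*
 * Illustrative sketch, not part of the original file: how a queueing path
 * such as __queue_work() below may call first_pwq() for an unbound
 * workqueue.  Queueing runs with irqs disabled, which also counts as a
 * sched-RCU read-side critical section, so the lockdep assertion in
 * first_pwq() is satisfied.  The helper name is hypothetical.
 */
static bool __maybe_unused example_wq_has_pwq(struct workqueue_struct *wq)
{
	struct pool_workqueue *pwq;
	unsigned long flags;

	local_irq_save(flags);		/* irqs off implies sched-RCU protection */
	pwq = first_pwq(wq);
	local_irq_restore(flags);

	return pwq != NULL;
}
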
73f53c4a
TH
498static unsigned int work_color_to_flags(int color)
499{
500 return color << WORK_STRUCT_COLOR_SHIFT;
501}
502
503static int get_work_color(struct work_struct *work)
504{
505 return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) &
506 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
507}
508
509static int work_next_color(int color)
510{
511 return (color + 1) % WORK_NR_COLORS;
512}
1da177e4 513
14441960 514/*
112202d9 515 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's data
516 * contain the pointer to the queued pwq. Once execution starts, the flag
7c3eed5c 517 * is cleared and the high bits contain OFFQ flags and pool ID.
7a22ad75 518 *
112202d9 519 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
520 * and clear_work_data() can be used to set the pwq, pool or clear
bbb68dfa 521 * work->data. These functions should only be called while the work is
522 * owned - i.e. while the PENDING bit is set.
7a22ad75 523 *
112202d9 524 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
7c3eed5c 525 * corresponding to a work. Pool is available once the work has been
112202d9 526 * queued anywhere after initialization until it is sync canceled. pwq is
7c3eed5c 527 * available only while the work item is queued.
7a22ad75 528 *
bbb68dfa 529 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
530 * canceled. While being canceled, a work item may have its PENDING set
531 * but stay off the timer and worklist for arbitrarily long and nobody should
532 * try to steal the PENDING bit.
14441960 533 */
7a22ad75
TH
534static inline void set_work_data(struct work_struct *work, unsigned long data,
535 unsigned long flags)
365970a1 536{
6183c009 537 WARN_ON_ONCE(!work_pending(work));
7a22ad75
TH
538 atomic_long_set(&work->data, data | flags | work_static(work));
539}
365970a1 540
112202d9 541static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
7a22ad75
TH
542 unsigned long extra_flags)
543{
112202d9
TH
544 set_work_data(work, (unsigned long)pwq,
545 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
365970a1
DH
546}
547
4468a00f
LJ
548static void set_work_pool_and_keep_pending(struct work_struct *work,
549 int pool_id)
550{
551 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
552 WORK_STRUCT_PENDING);
553}
554
7c3eed5c
TH
555static void set_work_pool_and_clear_pending(struct work_struct *work,
556 int pool_id)
7a22ad75 557{
23657bb1
TH
558 /*
559 * The following wmb is paired with the implied mb in
560 * test_and_set_bit(PENDING) and ensures all updates to @work made
561 * here are visible to and precede any updates by the next PENDING
562 * owner.
563 */
564 smp_wmb();
7c3eed5c 565 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
7a22ad75 566}
f756d5e2 567
7a22ad75 568static void clear_work_data(struct work_struct *work)
1da177e4 569{
7c3eed5c
TH
570 smp_wmb(); /* see set_work_pool_and_clear_pending() */
571 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
1da177e4
LT
572}
573
112202d9 574static struct pool_workqueue *get_work_pwq(struct work_struct *work)
b1f4ec17 575{
e120153d 576 unsigned long data = atomic_long_read(&work->data);
7a22ad75 577
112202d9 578 if (data & WORK_STRUCT_PWQ)
e120153d
TH
579 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
580 else
581 return NULL;
4d707b9f
ON
582}
583
7c3eed5c
TH
584/**
585 * get_work_pool - return the worker_pool a given work was associated with
586 * @work: the work item of interest
587 *
588 * Return the worker_pool @work was last associated with. %NULL if none.
589 */
590static struct worker_pool *get_work_pool(struct work_struct *work)
365970a1 591{
e120153d 592 unsigned long data = atomic_long_read(&work->data);
7c3eed5c
TH
593 struct worker_pool *pool;
594 int pool_id;
7a22ad75 595
112202d9
TH
596 if (data & WORK_STRUCT_PWQ)
597 return ((struct pool_workqueue *)
7c3eed5c 598 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
7a22ad75 599
7c3eed5c
TH
600 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
601 if (pool_id == WORK_OFFQ_POOL_NONE)
7a22ad75
TH
602 return NULL;
603
7c3eed5c
TH
604 pool = worker_pool_by_id(pool_id);
605 WARN_ON_ONCE(!pool);
606 return pool;
607}
608
609/**
610 * get_work_pool_id - return the worker pool ID a given work is associated with
611 * @work: the work item of interest
612 *
613 * Return the worker_pool ID @work was last associated with.
614 * %WORK_OFFQ_POOL_NONE if none.
615 */
616static int get_work_pool_id(struct work_struct *work)
617{
54d5b7d0
LJ
618 unsigned long data = atomic_long_read(&work->data);
619
112202d9
TH
620 if (data & WORK_STRUCT_PWQ)
621 return ((struct pool_workqueue *)
54d5b7d0 622 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
7c3eed5c 623
54d5b7d0 624 return data >> WORK_OFFQ_POOL_SHIFT;
7c3eed5c
TH
625}
626
bbb68dfa
TH
627static void mark_work_canceling(struct work_struct *work)
628{
7c3eed5c 629 unsigned long pool_id = get_work_pool_id(work);
bbb68dfa 630
7c3eed5c
TH
631 pool_id <<= WORK_OFFQ_POOL_SHIFT;
632 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
bbb68dfa
TH
633}
634
635static bool work_is_canceling(struct work_struct *work)
636{
637 unsigned long data = atomic_long_read(&work->data);
638
112202d9 639 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
bbb68dfa
TH
640}
641
e22bee78 642/*
3270476a
TH
643 * Policy functions. These define the policies on how the global worker
644 * pools are managed. Unless noted otherwise, these functions assume that
d565ed63 645 * they're being called with pool->lock held.
e22bee78
TH
646 */
647
63d95a91 648static bool __need_more_worker(struct worker_pool *pool)
a848e3b6 649{
e19e397a 650 return !atomic_read(&pool->nr_running);
a848e3b6
ON
651}
652
4594bf15 653/*
e22bee78
TH
654 * Need to wake up a worker? Called from anything but currently
655 * running workers.
974271c4
TH
656 *
657 * Note that, because unbound workers never contribute to nr_running, this
706026c2 658 * function will always return %true for unbound pools as long as the
974271c4 659 * worklist isn't empty.
4594bf15 660 */
63d95a91 661static bool need_more_worker(struct worker_pool *pool)
365970a1 662{
63d95a91 663 return !list_empty(&pool->worklist) && __need_more_worker(pool);
e22bee78 664}
4594bf15 665
e22bee78 666/* Can I start working? Called from busy but !running workers. */
63d95a91 667static bool may_start_working(struct worker_pool *pool)
e22bee78 668{
63d95a91 669 return pool->nr_idle;
e22bee78
TH
670}
671
672/* Do I need to keep working? Called from currently running workers. */
63d95a91 673static bool keep_working(struct worker_pool *pool)
e22bee78 674{
e19e397a
TH
675 return !list_empty(&pool->worklist) &&
676 atomic_read(&pool->nr_running) <= 1;
e22bee78
TH
677}
678
679/* Do we need a new worker? Called from manager. */
63d95a91 680static bool need_to_create_worker(struct worker_pool *pool)
e22bee78 681{
63d95a91 682 return need_more_worker(pool) && !may_start_working(pool);
e22bee78 683}
365970a1 684
e22bee78 685/* Do I need to be the manager? */
63d95a91 686static bool need_to_manage_workers(struct worker_pool *pool)
e22bee78 687{
63d95a91 688 return need_to_create_worker(pool) ||
11ebea50 689 (pool->flags & POOL_MANAGE_WORKERS);
e22bee78
TH
690}
691
692/* Do we have too many workers and should some go away? */
63d95a91 693static bool too_many_workers(struct worker_pool *pool)
e22bee78 694{
552a37e9 695 bool managing = pool->flags & POOL_MANAGING_WORKERS;
63d95a91 696 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
697 int nr_busy = pool->nr_workers - nr_idle;
e22bee78 698
ea1abd61 699 /*
700 * nr_idle and idle_list may disagree if idle rebinding is in
701 * progress. Never return %true if idle_list is empty.
702 */
703 if (list_empty(&pool->idle_list))
704 return false;
705
e22bee78 706 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
365970a1 707}
708
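/*
 * Worked example for the check above (illustrative, not part of the
 * original file): with MAX_IDLE_WORKERS_RATIO == 4 and nr_busy == 8,
 * three idle workers are fine since (3 - 2) * 4 = 4 < 8, but a fourth
 * makes (4 - 2) * 4 = 8 >= 8 and the pool is considered to have too many,
 * letting the idle timer start retiring the excess ones.
 */
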
4d707b9f 709/*
e22bee78
TH
710 * Wake up functions.
711 */
712
7e11629d 713/* Return the first worker. Safe with preemption disabled */
63d95a91 714static struct worker *first_worker(struct worker_pool *pool)
7e11629d 715{
63d95a91 716 if (unlikely(list_empty(&pool->idle_list)))
7e11629d
TH
717 return NULL;
718
63d95a91 719 return list_first_entry(&pool->idle_list, struct worker, entry);
7e11629d
TH
720}
721
722/**
723 * wake_up_worker - wake up an idle worker
63d95a91 724 * @pool: worker pool to wake worker from
7e11629d 725 *
63d95a91 726 * Wake up the first idle worker of @pool.
7e11629d
TH
727 *
728 * CONTEXT:
d565ed63 729 * spin_lock_irq(pool->lock).
7e11629d 730 */
63d95a91 731static void wake_up_worker(struct worker_pool *pool)
7e11629d 732{
63d95a91 733 struct worker *worker = first_worker(pool);
7e11629d
TH
734
735 if (likely(worker))
736 wake_up_process(worker->task);
737}
738
d302f017 739/**
e22bee78
TH
740 * wq_worker_waking_up - a worker is waking up
741 * @task: task waking up
742 * @cpu: CPU @task is waking up to
743 *
744 * This function is called during try_to_wake_up() when a worker is
745 * being awoken.
746 *
747 * CONTEXT:
748 * spin_lock_irq(rq->lock)
749 */
d84ff051 750void wq_worker_waking_up(struct task_struct *task, int cpu)
e22bee78
TH
751{
752 struct worker *worker = kthread_data(task);
753
36576000 754 if (!(worker->flags & WORKER_NOT_RUNNING)) {
ec22ca5e 755 WARN_ON_ONCE(worker->pool->cpu != cpu);
e19e397a 756 atomic_inc(&worker->pool->nr_running);
36576000 757 }
e22bee78
TH
758}
759
760/**
761 * wq_worker_sleeping - a worker is going to sleep
762 * @task: task going to sleep
763 * @cpu: CPU in question, must be the current CPU number
764 *
765 * This function is called during schedule() when a busy worker is
766 * going to sleep. Worker on the same cpu can be woken up by
767 * returning pointer to its task.
768 *
769 * CONTEXT:
770 * spin_lock_irq(rq->lock)
771 *
772 * RETURNS:
773 * Worker task on @cpu to wake up, %NULL if none.
774 */
d84ff051 775struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
e22bee78
TH
776{
777 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
111c225a 778 struct worker_pool *pool;
e22bee78 779
111c225a
TH
780 /*
781 * Rescuers, which may not have all the fields set up like normal
782 * workers, also reach here; let's not access anything before
783 * checking NOT_RUNNING.
784 */
2d64672e 785 if (worker->flags & WORKER_NOT_RUNNING)
e22bee78
TH
786 return NULL;
787
111c225a 788 pool = worker->pool;
111c225a 789
e22bee78 790 /* this can only happen on the local cpu */
6183c009
TH
791 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
792 return NULL;
e22bee78
TH
793
794 /*
795 * The counterpart of the following dec_and_test, implied mb,
796 * worklist not empty test sequence is in insert_work().
797 * Please read comment there.
798 *
628c78e7
TH
799 * NOT_RUNNING is clear. This means that we're bound to and
800 * running on the local cpu w/ rq lock held and preemption
801 * disabled, which in turn means that none else could be
d565ed63 802 * manipulating idle_list, so dereferencing idle_list without pool
628c78e7 803 * lock is safe.
e22bee78 804 */
e19e397a
TH
805 if (atomic_dec_and_test(&pool->nr_running) &&
806 !list_empty(&pool->worklist))
63d95a91 807 to_wakeup = first_worker(pool);
e22bee78
TH
808 return to_wakeup ? to_wakeup->task : NULL;
809}
810
811/**
812 * worker_set_flags - set worker flags and adjust nr_running accordingly
cb444766 813 * @worker: self
d302f017
TH
814 * @flags: flags to set
815 * @wakeup: wakeup an idle worker if necessary
816 *
e22bee78
TH
817 * Set @flags in @worker->flags and adjust nr_running accordingly. If
818 * nr_running becomes zero and @wakeup is %true, an idle worker is
819 * woken up.
d302f017 820 *
cb444766 821 * CONTEXT:
d565ed63 822 * spin_lock_irq(pool->lock)
d302f017
TH
823 */
824static inline void worker_set_flags(struct worker *worker, unsigned int flags,
825 bool wakeup)
826{
bd7bdd43 827 struct worker_pool *pool = worker->pool;
e22bee78 828
cb444766
TH
829 WARN_ON_ONCE(worker->task != current);
830
e22bee78
TH
831 /*
832 * If transitioning into NOT_RUNNING, adjust nr_running and
833 * wake up an idle worker as necessary if requested by
834 * @wakeup.
835 */
836 if ((flags & WORKER_NOT_RUNNING) &&
837 !(worker->flags & WORKER_NOT_RUNNING)) {
e22bee78 838 if (wakeup) {
e19e397a 839 if (atomic_dec_and_test(&pool->nr_running) &&
bd7bdd43 840 !list_empty(&pool->worklist))
63d95a91 841 wake_up_worker(pool);
e22bee78 842 } else
e19e397a 843 atomic_dec(&pool->nr_running);
e22bee78
TH
844 }
845
d302f017
TH
846 worker->flags |= flags;
847}
848
849/**
e22bee78 850 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
cb444766 851 * @worker: self
d302f017
TH
852 * @flags: flags to clear
853 *
e22bee78 854 * Clear @flags in @worker->flags and adjust nr_running accordingly.
d302f017 855 *
cb444766 856 * CONTEXT:
d565ed63 857 * spin_lock_irq(pool->lock)
d302f017
TH
858 */
859static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
860{
63d95a91 861 struct worker_pool *pool = worker->pool;
e22bee78
TH
862 unsigned int oflags = worker->flags;
863
cb444766
TH
864 WARN_ON_ONCE(worker->task != current);
865
d302f017 866 worker->flags &= ~flags;
e22bee78 867
42c025f3
TH
868 /*
869 * If transitioning out of NOT_RUNNING, increment nr_running. Note
870 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
871 * of multiple flags, not a single flag.
872 */
e22bee78
TH
873 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
874 if (!(worker->flags & WORKER_NOT_RUNNING))
e19e397a 875 atomic_inc(&pool->nr_running);
d302f017
TH
876}
877
8cca0eea
TH
878/**
879 * find_worker_executing_work - find worker which is executing a work
c9e7cf27 880 * @pool: pool of interest
8cca0eea
TH
881 * @work: work to find worker for
882 *
c9e7cf27
TH
883 * Find a worker which is executing @work on @pool by searching
884 * @pool->busy_hash which is keyed by the address of @work. For a worker
a2c1c57b
TH
885 * to match, its current execution should match the address of @work and
886 * its work function. This is to avoid unwanted dependency between
887 * unrelated work executions through a work item being recycled while still
888 * being executed.
889 *
890 * This is a bit tricky. A work item may be freed once its execution
891 * starts and nothing prevents the freed area from being recycled for
892 * another work item. If the same work item address ends up being reused
893 * before the original execution finishes, workqueue will identify the
894 * recycled work item as currently executing and make it wait until the
895 * current execution finishes, introducing an unwanted dependency.
896 *
897 * This function checks the work item address, work function and workqueue
898 * to avoid false positives. Note that this isn't complete as one may
899 * construct a work function which can introduce dependency onto itself
900 * through a recycled work item. Well, if somebody wants to shoot oneself
901 * in the foot that badly, there's only so much we can do, and if such
902 * deadlock actually occurs, it should be easy to locate the culprit work
903 * function.
8cca0eea
TH
904 *
905 * CONTEXT:
d565ed63 906 * spin_lock_irq(pool->lock).
8cca0eea
TH
907 *
908 * RETURNS:
909 * Pointer to worker which is executing @work if found, NULL
910 * otherwise.
4d707b9f 911 */
c9e7cf27 912static struct worker *find_worker_executing_work(struct worker_pool *pool,
8cca0eea 913 struct work_struct *work)
4d707b9f 914{
42f8570f 915 struct worker *worker;
42f8570f 916
b67bfe0d 917 hash_for_each_possible(pool->busy_hash, worker, hentry,
a2c1c57b
TH
918 (unsigned long)work)
919 if (worker->current_work == work &&
920 worker->current_func == work->func)
42f8570f
SL
921 return worker;
922
923 return NULL;
4d707b9f
ON
924}
925
bf4ede01
TH
926/**
927 * move_linked_works - move linked works to a list
928 * @work: start of series of works to be scheduled
929 * @head: target list to append @work to
930 * @nextp: out parameter for nested worklist walking
931 *
932 * Schedule linked works starting from @work to @head. Work series to
933 * be scheduled starts at @work and includes any consecutive work with
934 * WORK_STRUCT_LINKED set in its predecessor.
935 *
936 * If @nextp is not NULL, it's updated to point to the next work of
937 * the last scheduled work. This allows move_linked_works() to be
938 * nested inside outer list_for_each_entry_safe().
939 *
940 * CONTEXT:
d565ed63 941 * spin_lock_irq(pool->lock).
bf4ede01
TH
942 */
943static void move_linked_works(struct work_struct *work, struct list_head *head,
944 struct work_struct **nextp)
945{
946 struct work_struct *n;
947
948 /*
949 * Linked worklist will always end before the end of the list,
950 * use NULL for list head.
951 */
952 list_for_each_entry_safe_from(work, n, NULL, entry) {
953 list_move_tail(&work->entry, head);
954 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
955 break;
956 }
957
958 /*
959 * If we're already inside safe list traversal and have moved
960 * multiple works to the scheduled queue, the next position
961 * needs to be updated.
962 */
963 if (nextp)
964 *nextp = n;
965}
966
112202d9 967static void pwq_activate_delayed_work(struct work_struct *work)
bf4ede01 968{
112202d9 969 struct pool_workqueue *pwq = get_work_pwq(work);
bf4ede01
TH
970
971 trace_workqueue_activate_work(work);
112202d9 972 move_linked_works(work, &pwq->pool->worklist, NULL);
bf4ede01 973 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
112202d9 974 pwq->nr_active++;
bf4ede01
TH
975}
976
112202d9 977static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
3aa62497 978{
112202d9 979 struct work_struct *work = list_first_entry(&pwq->delayed_works,
3aa62497
LJ
980 struct work_struct, entry);
981
112202d9 982 pwq_activate_delayed_work(work);
3aa62497
LJ
983}
984
bf4ede01 985/**
112202d9
TH
986 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
987 * @pwq: pwq of interest
bf4ede01 988 * @color: color of work which left the queue
bf4ede01
TH
989 *
990 * A work either has completed or is removed from pending queue,
112202d9 991 * decrement nr_in_flight of its pwq and handle workqueue flushing.
bf4ede01
TH
992 *
993 * CONTEXT:
d565ed63 994 * spin_lock_irq(pool->lock).
bf4ede01 995 */
112202d9 996static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
bf4ede01
TH
997{
998 /* ignore uncolored works */
999 if (color == WORK_NO_COLOR)
1000 return;
1001
112202d9 1002 pwq->nr_in_flight[color]--;
bf4ede01 1003
112202d9
TH
1004 pwq->nr_active--;
1005 if (!list_empty(&pwq->delayed_works)) {
b3f9f405 1006 /* one down, submit a delayed one */
112202d9
TH
1007 if (pwq->nr_active < pwq->max_active)
1008 pwq_activate_first_delayed(pwq);
bf4ede01
TH
1009 }
1010
1011 /* is flush in progress and are we at the flushing tip? */
112202d9 1012 if (likely(pwq->flush_color != color))
bf4ede01
TH
1013 return;
1014
1015 /* are there still in-flight works? */
112202d9 1016 if (pwq->nr_in_flight[color])
bf4ede01
TH
1017 return;
1018
112202d9
TH
1019 /* this pwq is done, clear flush_color */
1020 pwq->flush_color = -1;
bf4ede01
TH
1021
1022 /*
112202d9 1023 * If this was the last pwq, wake up the first flusher. It
bf4ede01
TH
1024 * will handle the rest.
1025 */
112202d9
TH
1026 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1027 complete(&pwq->wq->first_flusher->done);
bf4ede01
TH
1028}
1029
36e227d2 1030/**
bbb68dfa 1031 * try_to_grab_pending - steal work item from worklist and disable irq
36e227d2
TH
1032 * @work: work item to steal
1033 * @is_dwork: @work is a delayed_work
bbb68dfa 1034 * @flags: place to store irq state
36e227d2
TH
1035 *
1036 * Try to grab PENDING bit of @work. This function can handle @work in any
1037 * stable state - idle, on timer or on worklist. Return values are
1038 *
1039 * 1 if @work was pending and we successfully stole PENDING
1040 * 0 if @work was idle and we claimed PENDING
1041 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
bbb68dfa
TH
1042 * -ENOENT if someone else is canceling @work, this state may persist
1043 * for arbitrarily long
36e227d2 1044 *
bbb68dfa 1045 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
e0aecdd8
TH
1046 * interrupted while holding PENDING and @work off queue, irq must be
1047 * disabled on entry. This, combined with delayed_work->timer being
1048 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
bbb68dfa
TH
1049 *
1050 * On successful return, >= 0, irq is disabled and the caller is
1051 * responsible for releasing it using local_irq_restore(*@flags).
1052 *
e0aecdd8 1053 * This function is safe to call from any context including IRQ handler.
bf4ede01 1054 */
bbb68dfa
TH
1055static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1056 unsigned long *flags)
bf4ede01 1057{
d565ed63 1058 struct worker_pool *pool;
112202d9 1059 struct pool_workqueue *pwq;
bf4ede01 1060
bbb68dfa
TH
1061 local_irq_save(*flags);
1062
36e227d2
TH
1063 /* try to steal the timer if it exists */
1064 if (is_dwork) {
1065 struct delayed_work *dwork = to_delayed_work(work);
1066
e0aecdd8
TH
1067 /*
1068 * dwork->timer is irqsafe. If del_timer() fails, it's
1069 * guaranteed that the timer is not queued anywhere and not
1070 * running on the local CPU.
1071 */
36e227d2
TH
1072 if (likely(del_timer(&dwork->timer)))
1073 return 1;
1074 }
1075
1076 /* try to claim PENDING the normal way */
bf4ede01
TH
1077 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1078 return 0;
1079
1080 /*
1081 * The queueing is in progress, or it is already queued. Try to
1082 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1083 */
d565ed63
TH
1084 pool = get_work_pool(work);
1085 if (!pool)
bbb68dfa 1086 goto fail;
bf4ede01 1087
d565ed63 1088 spin_lock(&pool->lock);
0b3dae68 1089 /*
112202d9
TH
1090 * work->data is guaranteed to point to pwq only while the work
1091 * item is queued on pwq->wq, and both updating work->data to point
1092 * to pwq on queueing and to pool on dequeueing are done under
1093 * pwq->pool->lock. This in turn guarantees that, if work->data
1094 * points to pwq which is associated with a locked pool, the work
0b3dae68
LJ
1095 * item is currently queued on that pool.
1096 */
112202d9
TH
1097 pwq = get_work_pwq(work);
1098 if (pwq && pwq->pool == pool) {
16062836
TH
1099 debug_work_deactivate(work);
1100
1101 /*
1102 * A delayed work item cannot be grabbed directly because
1103 * it might have linked NO_COLOR work items which, if left
112202d9 1104 * on the delayed_list, will confuse pwq->nr_active
16062836
TH
1105 * management later on and cause stall. Make sure the work
1106 * item is activated before grabbing.
1107 */
1108 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
112202d9 1109 pwq_activate_delayed_work(work);
16062836
TH
1110
1111 list_del_init(&work->entry);
112202d9 1112 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
16062836 1113
112202d9 1114 /* work->data points to pwq iff queued, point to pool */
16062836
TH
1115 set_work_pool_and_keep_pending(work, pool->id);
1116
1117 spin_unlock(&pool->lock);
1118 return 1;
bf4ede01 1119 }
d565ed63 1120 spin_unlock(&pool->lock);
bbb68dfa
TH
1121fail:
1122 local_irq_restore(*flags);
1123 if (work_is_canceling(work))
1124 return -ENOENT;
1125 cpu_relax();
36e227d2 1126 return -EAGAIN;
bf4ede01
TH
1127}
1128
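/*
 * Illustrative sketch, not part of the original file: the canonical
 * caller pattern for try_to_grab_pending(), as used by
 * mod_delayed_work_on() and the cancel paths further below.  -EAGAIN is
 * transient and busy-retried; -ENOENT means somebody else is already
 * canceling the item.  The helper name is hypothetical.
 */
static bool __maybe_unused example_steal_pending(struct work_struct *work)
{
	unsigned long flags;
	int ret;

	do {
		ret = try_to_grab_pending(work, false, &flags);
	} while (unlikely(ret == -EAGAIN));

	if (ret < 0)
		return false;	/* -ENOENT: being canceled elsewhere */

	/*
	 * We now own PENDING with irqs disabled.  A real caller would
	 * either requeue the item (mod_delayed_work_on()) or clear
	 * PENDING (the cancel paths) before releasing irqs.
	 */
	local_irq_restore(flags);
	return true;
}
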
4690c4ab 1129/**
706026c2 1130 * insert_work - insert a work into a pool
112202d9 1131 * @pwq: pwq @work belongs to
4690c4ab
TH
1132 * @work: work to insert
1133 * @head: insertion point
1134 * @extra_flags: extra WORK_STRUCT_* flags to set
1135 *
112202d9 1136 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
706026c2 1137 * work_struct flags.
4690c4ab
TH
1138 *
1139 * CONTEXT:
d565ed63 1140 * spin_lock_irq(pool->lock).
4690c4ab 1141 */
112202d9
TH
1142static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1143 struct list_head *head, unsigned int extra_flags)
b89deed3 1144{
112202d9 1145 struct worker_pool *pool = pwq->pool;
e22bee78 1146
4690c4ab 1147 /* we own @work, set data and link */
112202d9 1148 set_work_pwq(work, pwq, extra_flags);
1a4d9b0a 1149 list_add_tail(&work->entry, head);
e22bee78
TH
1150
1151 /*
1152 * Ensure either worker_sched_deactivated() sees the above
1153 * list_add_tail() or we see zero nr_running to avoid workers
1154 * lying around lazily while there are works to be processed.
1155 */
1156 smp_mb();
1157
63d95a91
TH
1158 if (__need_more_worker(pool))
1159 wake_up_worker(pool);
b89deed3
ON
1160}
1161
c8efcc25
TH
1162/*
1163 * Test whether @work is being queued from another work executing on the
8d03ecfe 1164 * same workqueue.
c8efcc25
TH
1165 */
1166static bool is_chained_work(struct workqueue_struct *wq)
1167{
8d03ecfe
TH
1168 struct worker *worker;
1169
1170 worker = current_wq_worker();
1171 /*
1172 * Return %true iff I'm a worker executing a work item on @wq. If
1173 * I'm @worker, it's safe to dereference it without locking.
1174 */
112202d9 1175 return worker && worker->current_pwq->wq == wq;
c8efcc25
TH
1176}
1177
d84ff051 1178static void __queue_work(int cpu, struct workqueue_struct *wq,
1da177e4
LT
1179 struct work_struct *work)
1180{
112202d9 1181 struct pool_workqueue *pwq;
1e19ffc6 1182 struct list_head *worklist;
8a2e8e5d 1183 unsigned int work_flags;
b75cac93 1184 unsigned int req_cpu = cpu;
8930caba
TH
1185
1186 /*
1187 * While a work item is PENDING && off queue, a task trying to
1188 * steal the PENDING will busy-loop waiting for it to either get
1189 * queued or lose PENDING. Grabbing PENDING and queueing should
1190 * happen with IRQ disabled.
1191 */
1192 WARN_ON_ONCE(!irqs_disabled());
1da177e4 1193
dc186ad7 1194 debug_work_activate(work);
1e19ffc6 1195
c8efcc25 1196 /* if dying, only works from the same workqueue are allowed */
9c5a2ba7 1197 if (unlikely(wq->flags & WQ_DRAINING) &&
c8efcc25 1198 WARN_ON_ONCE(!is_chained_work(wq)))
e41e704b
TH
1199 return;
1200
112202d9 1201 /* determine the pwq to use */
c7fc77f7 1202 if (!(wq->flags & WQ_UNBOUND)) {
c9e7cf27 1203 struct worker_pool *last_pool;
18aa9eff 1204
57469821 1205 if (cpu == WORK_CPU_UNBOUND)
c7fc77f7
TH
1206 cpu = raw_smp_processor_id();
1207
18aa9eff 1208 /*
dbf2576e
TH
1209 * It's multi cpu. If @work was previously on a different
1210 * cpu, it might still be running there, in which case the
1211 * work needs to be queued on that cpu to guarantee
1212 * non-reentrancy.
18aa9eff 1213 */
7fb98ea7 1214 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
c9e7cf27 1215 last_pool = get_work_pool(work);
dbf2576e 1216
112202d9 1217 if (last_pool && last_pool != pwq->pool) {
18aa9eff
TH
1218 struct worker *worker;
1219
d565ed63 1220 spin_lock(&last_pool->lock);
18aa9eff 1221
c9e7cf27 1222 worker = find_worker_executing_work(last_pool, work);
18aa9eff 1223
112202d9 1224 if (worker && worker->current_pwq->wq == wq) {
7fb98ea7 1225 pwq = per_cpu_ptr(wq->cpu_pwqs, last_pool->cpu);
8594fade 1226 } else {
18aa9eff 1227 /* meh... not running there, queue here */
d565ed63 1228 spin_unlock(&last_pool->lock);
112202d9 1229 spin_lock(&pwq->pool->lock);
18aa9eff 1230 }
8930caba 1231 } else {
112202d9 1232 spin_lock(&pwq->pool->lock);
8930caba 1233 }
f3421797 1234 } else {
7fb98ea7 1235 pwq = first_pwq(wq);
112202d9 1236 spin_lock(&pwq->pool->lock);
502ca9d8
TH
1237 }
1238
112202d9
TH
1239 /* pwq determined, queue */
1240 trace_workqueue_queue_work(req_cpu, pwq, work);
502ca9d8 1241
f5b2552b 1242 if (WARN_ON(!list_empty(&work->entry))) {
112202d9 1243 spin_unlock(&pwq->pool->lock);
f5b2552b
DC
1244 return;
1245 }
1e19ffc6 1246
112202d9
TH
1247 pwq->nr_in_flight[pwq->work_color]++;
1248 work_flags = work_color_to_flags(pwq->work_color);
1e19ffc6 1249
112202d9 1250 if (likely(pwq->nr_active < pwq->max_active)) {
cdadf009 1251 trace_workqueue_activate_work(work);
112202d9
TH
1252 pwq->nr_active++;
1253 worklist = &pwq->pool->worklist;
8a2e8e5d
TH
1254 } else {
1255 work_flags |= WORK_STRUCT_DELAYED;
112202d9 1256 worklist = &pwq->delayed_works;
8a2e8e5d 1257 }
1e19ffc6 1258
112202d9 1259 insert_work(pwq, work, worklist, work_flags);
1e19ffc6 1260
112202d9 1261 spin_unlock(&pwq->pool->lock);
1da177e4
LT
1262}
1263
0fcb78c2 1264/**
c1a220e7
ZR
1265 * queue_work_on - queue work on specific cpu
1266 * @cpu: CPU number to execute work on
0fcb78c2
REB
1267 * @wq: workqueue to use
1268 * @work: work to queue
1269 *
d4283e93 1270 * Returns %false if @work was already on a queue, %true otherwise.
1da177e4 1271 *
c1a220e7
ZR
1272 * We queue the work to a specific CPU, the caller must ensure it
1273 * can't go away.
1da177e4 1274 */
d4283e93
TH
1275bool queue_work_on(int cpu, struct workqueue_struct *wq,
1276 struct work_struct *work)
1da177e4 1277{
d4283e93 1278 bool ret = false;
8930caba 1279 unsigned long flags;
ef1ca236 1280
8930caba 1281 local_irq_save(flags);
c1a220e7 1282
22df02bb 1283 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
4690c4ab 1284 __queue_work(cpu, wq, work);
d4283e93 1285 ret = true;
c1a220e7 1286 }
ef1ca236 1287
8930caba 1288 local_irq_restore(flags);
1da177e4
LT
1289 return ret;
1290}
c1a220e7 1291EXPORT_SYMBOL_GPL(queue_work_on);
1da177e4 1292
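/*
 * Illustrative sketch, not part of the original file: pinning a work item
 * to a specific CPU, e.g. to touch that CPU's per-cpu data.  The caller
 * must keep the CPU from going away, hence get_online_cpus().  The helper
 * name is hypothetical.
 */
static void __maybe_unused example_queue_on_cpu(struct work_struct *work, int cpu)
{
	get_online_cpus();
	if (cpu_online(cpu))
		queue_work_on(cpu, system_wq, work);
	put_online_cpus();
}
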
c1a220e7 1293/**
0a13c00e 1294 * queue_work - queue work on a workqueue
c1a220e7
ZR
1295 * @wq: workqueue to use
1296 * @work: work to queue
1297 *
d4283e93 1298 * Returns %false if @work was already on a queue, %true otherwise.
c1a220e7 1299 *
0a13c00e
TH
1300 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1301 * it can be processed by another CPU.
c1a220e7 1302 */
d4283e93 1303bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
c1a220e7 1304{
57469821 1305 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
c1a220e7 1306}
0a13c00e 1307EXPORT_SYMBOL_GPL(queue_work);
c1a220e7 1308
d8e794df 1309void delayed_work_timer_fn(unsigned long __data)
1da177e4 1310{
52bad64d 1311 struct delayed_work *dwork = (struct delayed_work *)__data;
1da177e4 1312
e0aecdd8 1313 /* should have been called from irqsafe timer with irq already off */
60c057bc 1314 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1da177e4 1315}
1438ade5 1316EXPORT_SYMBOL(delayed_work_timer_fn);
1da177e4 1317
7beb2edf
TH
1318static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1319 struct delayed_work *dwork, unsigned long delay)
1da177e4 1320{
7beb2edf
TH
1321 struct timer_list *timer = &dwork->timer;
1322 struct work_struct *work = &dwork->work;
7beb2edf
TH
1323
1324 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1325 timer->data != (unsigned long)dwork);
fc4b514f
TH
1326 WARN_ON_ONCE(timer_pending(timer));
1327 WARN_ON_ONCE(!list_empty(&work->entry));
7beb2edf 1328
8852aac2
TH
1329 /*
1330 * If @delay is 0, queue @dwork->work immediately. This is for
1331 * both optimization and correctness. The earliest @timer can
1332 * expire is on the closest next tick and delayed_work users depend
1333 * on there being no such delay when @delay is 0.
1334 */
1335 if (!delay) {
1336 __queue_work(cpu, wq, &dwork->work);
1337 return;
1338 }
1339
7beb2edf 1340 timer_stats_timer_set_start_info(&dwork->timer);
1da177e4 1341
60c057bc 1342 dwork->wq = wq;
1265057f 1343 dwork->cpu = cpu;
7beb2edf
TH
1344 timer->expires = jiffies + delay;
1345
1346 if (unlikely(cpu != WORK_CPU_UNBOUND))
1347 add_timer_on(timer, cpu);
1348 else
1349 add_timer(timer);
1da177e4
LT
1350}
1351
0fcb78c2
REB
1352/**
1353 * queue_delayed_work_on - queue work on specific CPU after delay
1354 * @cpu: CPU number to execute work on
1355 * @wq: workqueue to use
af9997e4 1356 * @dwork: work to queue
0fcb78c2
REB
1357 * @delay: number of jiffies to wait before queueing
1358 *
715f1300
TH
1359 * Returns %false if @work was already on a queue, %true otherwise. If
1360 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1361 * execution.
0fcb78c2 1362 */
d4283e93
TH
1363bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1364 struct delayed_work *dwork, unsigned long delay)
7a6bc1cd 1365{
52bad64d 1366 struct work_struct *work = &dwork->work;
d4283e93 1367 bool ret = false;
8930caba 1368 unsigned long flags;
7a6bc1cd 1369
8930caba
TH
1370 /* read the comment in __queue_work() */
1371 local_irq_save(flags);
7a6bc1cd 1372
22df02bb 1373 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
7beb2edf 1374 __queue_delayed_work(cpu, wq, dwork, delay);
d4283e93 1375 ret = true;
7a6bc1cd 1376 }
8a3e77cc 1377
8930caba 1378 local_irq_restore(flags);
7a6bc1cd
VP
1379 return ret;
1380}
ae90dd5d 1381EXPORT_SYMBOL_GPL(queue_delayed_work_on);
c7fc77f7 1382
0a13c00e
TH
1383/**
1384 * queue_delayed_work - queue work on a workqueue after delay
1385 * @wq: workqueue to use
1386 * @dwork: delayable work to queue
1387 * @delay: number of jiffies to wait before queueing
1388 *
715f1300 1389 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
0a13c00e 1390 */
d4283e93 1391bool queue_delayed_work(struct workqueue_struct *wq,
0a13c00e
TH
1392 struct delayed_work *dwork, unsigned long delay)
1393{
57469821 1394 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
0a13c00e
TH
1395}
1396EXPORT_SYMBOL_GPL(queue_delayed_work);
c7fc77f7 1397
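/*
 * Illustrative sketch, not part of the original file: the common
 * self-rearming pattern built on queue_delayed_work().  All names are
 * hypothetical.
 */
static struct delayed_work example_poll_work;

static void example_poll_fn(struct work_struct *work)
{
	/* ... do the periodic job ... */
	queue_delayed_work(system_wq, &example_poll_work, HZ);	/* rearm in 1s */
}

static void __maybe_unused example_poll_start(void)
{
	INIT_DELAYED_WORK(&example_poll_work, example_poll_fn);
	queue_delayed_work(system_wq, &example_poll_work, HZ);
}
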
8376fe22
TH
1398/**
1399 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1400 * @cpu: CPU number to execute work on
1401 * @wq: workqueue to use
1402 * @dwork: work to queue
1403 * @delay: number of jiffies to wait before queueing
1404 *
1405 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1406 * modify @dwork's timer so that it expires after @delay. If @delay is
1407 * zero, @work is guaranteed to be scheduled immediately regardless of its
1408 * current state.
1409 *
1410 * Returns %false if @dwork was idle and queued, %true if @dwork was
1411 * pending and its timer was modified.
1412 *
e0aecdd8 1413 * This function is safe to call from any context including IRQ handler.
8376fe22
TH
1414 * See try_to_grab_pending() for details.
1415 */
1416bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1417 struct delayed_work *dwork, unsigned long delay)
1418{
1419 unsigned long flags;
1420 int ret;
c7fc77f7 1421
8376fe22
TH
1422 do {
1423 ret = try_to_grab_pending(&dwork->work, true, &flags);
1424 } while (unlikely(ret == -EAGAIN));
63bc0362 1425
8376fe22
TH
1426 if (likely(ret >= 0)) {
1427 __queue_delayed_work(cpu, wq, dwork, delay);
1428 local_irq_restore(flags);
7a6bc1cd 1429 }
8376fe22
TH
1430
1431 /* -ENOENT from try_to_grab_pending() becomes %true */
7a6bc1cd
VP
1432 return ret;
1433}
8376fe22
TH
1434EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1435
1436/**
1437 * mod_delayed_work - modify delay of or queue a delayed work
1438 * @wq: workqueue to use
1439 * @dwork: work to queue
1440 * @delay: number of jiffies to wait before queueing
1441 *
1442 * mod_delayed_work_on() on local CPU.
1443 */
1444bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1445 unsigned long delay)
1446{
1447 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1448}
1449EXPORT_SYMBOL_GPL(mod_delayed_work);
1da177e4 1450
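/*
 * Illustrative sketch, not part of the original file: debouncing with
 * mod_delayed_work().  Every call pushes the deadline back, so the
 * handler runs once, 100ms after the last event.  All names are
 * hypothetical.
 */
static void example_debounce_fn(struct work_struct *work)
{
	/* runs once, 100ms after the last example_debounce_event() call */
}

static DECLARE_DELAYED_WORK(example_debounce_work, example_debounce_fn);

static void __maybe_unused example_debounce_event(void)
{
	/* safe from any context, including hard irq */
	mod_delayed_work(system_wq, &example_debounce_work,
			 msecs_to_jiffies(100));
}
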
c8e55f36
TH
1451/**
1452 * worker_enter_idle - enter idle state
1453 * @worker: worker which is entering idle state
1454 *
1455 * @worker is entering idle state. Update stats and idle timer if
1456 * necessary.
1457 *
1458 * LOCKING:
d565ed63 1459 * spin_lock_irq(pool->lock).
c8e55f36
TH
1460 */
1461static void worker_enter_idle(struct worker *worker)
1da177e4 1462{
bd7bdd43 1463 struct worker_pool *pool = worker->pool;
c8e55f36 1464
6183c009
TH
1465 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1466 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1467 (worker->hentry.next || worker->hentry.pprev)))
1468 return;
c8e55f36 1469
cb444766
TH
1470 /* can't use worker_set_flags(), also called from start_worker() */
1471 worker->flags |= WORKER_IDLE;
bd7bdd43 1472 pool->nr_idle++;
e22bee78 1473 worker->last_active = jiffies;
c8e55f36
TH
1474
1475 /* idle_list is LIFO */
bd7bdd43 1476 list_add(&worker->entry, &pool->idle_list);
db7bccf4 1477
628c78e7
TH
1478 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1479 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
cb444766 1480
544ecf31 1481 /*
706026c2 1482 * Sanity check nr_running. Because wq_unbind_fn() releases
d565ed63 1483 * pool->lock between setting %WORKER_UNBOUND and zapping
628c78e7
TH
1484 * nr_running, the warning may trigger spuriously. Check iff
1485 * unbind is not in progress.
544ecf31 1486 */
24647570 1487 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
bd7bdd43 1488 pool->nr_workers == pool->nr_idle &&
e19e397a 1489 atomic_read(&pool->nr_running));
c8e55f36
TH
1490}
1491
1492/**
1493 * worker_leave_idle - leave idle state
1494 * @worker: worker which is leaving idle state
1495 *
1496 * @worker is leaving idle state. Update stats.
1497 *
1498 * LOCKING:
d565ed63 1499 * spin_lock_irq(pool->lock).
c8e55f36
TH
1500 */
1501static void worker_leave_idle(struct worker *worker)
1502{
bd7bdd43 1503 struct worker_pool *pool = worker->pool;
c8e55f36 1504
6183c009
TH
1505 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1506 return;
d302f017 1507 worker_clr_flags(worker, WORKER_IDLE);
bd7bdd43 1508 pool->nr_idle--;
c8e55f36
TH
1509 list_del_init(&worker->entry);
1510}
1511
e22bee78 1512/**
f36dc67b
LJ
1513 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1514 * @pool: target worker_pool
1515 *
1516 * Bind %current to the cpu of @pool if it is associated and lock @pool.
e22bee78
TH
1517 *
1518 * Works which are scheduled while the cpu is online must at least be
1519 * scheduled to a worker which is bound to the cpu so that if they are
1520 * flushed from cpu callbacks while cpu is going down, they are
1521 * guaranteed to execute on the cpu.
1522 *
f5faa077 1523 * This function is to be used by unbound workers and rescuers to bind
e22bee78
TH
1524 * themselves to the target cpu and may race with cpu going down or
1525 * coming online. kthread_bind() can't be used because it may put the
1526 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
706026c2 1527 * verbatim as it's best effort and blocking and pool may be
e22bee78
TH
1528 * [dis]associated in the meantime.
1529 *
706026c2 1530 * This function tries set_cpus_allowed() and locks pool and verifies the
24647570 1531 * binding against %POOL_DISASSOCIATED which is set during
f2d5a0ee
TH
1532 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1533 * enters idle state or fetches works without dropping lock, it can
1534 * guarantee the scheduling requirement described in the first paragraph.
e22bee78
TH
1535 *
1536 * CONTEXT:
d565ed63 1537 * Might sleep. Called without any lock but returns with pool->lock
e22bee78
TH
1538 * held.
1539 *
1540 * RETURNS:
706026c2 1541 * %true if the associated pool is online (@worker is successfully
e22bee78
TH
1542 * bound), %false if offline.
1543 */
f36dc67b 1544static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
d565ed63 1545__acquires(&pool->lock)
e22bee78 1546{
e22bee78 1547 while (true) {
4e6045f1 1548 /*
e22bee78
TH
1549 * The following call may fail, succeed or succeed
1550 * without actually migrating the task to the cpu if
1551 * it races with a cpu hot-unplug operation. Verify
24647570 1552 * against POOL_DISASSOCIATED.
4e6045f1 1553 */
24647570 1554 if (!(pool->flags & POOL_DISASSOCIATED))
f5faa077 1555 set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu));
e22bee78 1556
d565ed63 1557 spin_lock_irq(&pool->lock);
24647570 1558 if (pool->flags & POOL_DISASSOCIATED)
e22bee78 1559 return false;
f5faa077 1560 if (task_cpu(current) == pool->cpu &&
e22bee78 1561 cpumask_equal(&current->cpus_allowed,
ec22ca5e 1562 get_cpu_mask(pool->cpu)))
e22bee78 1563 return true;
d565ed63 1564 spin_unlock_irq(&pool->lock);
e22bee78 1565
5035b20f
TH
1566 /*
1567 * We've raced with CPU hot[un]plug. Give it a breather
1568 * and retry migration. cond_resched() is required here;
1569 * otherwise, we might deadlock against cpu_stop trying to
1570 * bring down the CPU on a non-preemptive kernel.
1571 */
e22bee78 1572 cpu_relax();
5035b20f 1573 cond_resched();
e22bee78
TH
1574 }
1575}
1576
25511a47 1577/*
ea1abd61 1578 * Rebind an idle @worker to its CPU. worker_thread() will test
5f7dabfd 1579 * list_empty(@worker->entry) before leaving idle and call this function.
25511a47
TH
1580 */
1581static void idle_worker_rebind(struct worker *worker)
1582{
5f7dabfd 1583 /* CPU may go down again in between, clear UNBOUND only on success */
f36dc67b 1584 if (worker_maybe_bind_and_lock(worker->pool))
5f7dabfd 1585 worker_clr_flags(worker, WORKER_UNBOUND);
25511a47 1586
ea1abd61
LJ
1587 /* rebind complete, become available again */
1588 list_add(&worker->entry, &worker->pool->idle_list);
d565ed63 1589 spin_unlock_irq(&worker->pool->lock);
25511a47
TH
1590}
1591
e22bee78 1592/*
25511a47 1593 * Function for @worker->rebind.work used to rebind unbound busy workers to
403c821d
TH
1594 * the associated cpu which is coming back online. This is scheduled by
1595 * cpu up but can race with other cpu hotplug operations and may be
1596 * executed twice without intervening cpu down.
e22bee78 1597 */
25511a47 1598static void busy_worker_rebind_fn(struct work_struct *work)
e22bee78
TH
1599{
1600 struct worker *worker = container_of(work, struct worker, rebind_work);
e22bee78 1601
f36dc67b 1602 if (worker_maybe_bind_and_lock(worker->pool))
eab6d828 1603 worker_clr_flags(worker, WORKER_UNBOUND);
e22bee78 1604
d565ed63 1605 spin_unlock_irq(&worker->pool->lock);
e22bee78
TH
1606}
1607
25511a47 1608/**
94cf58bb
TH
1609 * rebind_workers - rebind all workers of a pool to the associated CPU
1610 * @pool: pool of interest
25511a47 1611 *
94cf58bb 1612 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
25511a47
TH
1613 * is different for idle and busy ones.
1614 *
ea1abd61
LJ
1615 * Idle ones will be removed from the idle_list and woken up. They will
1616 * add themselves back after completing rebind. This ensures that the
1617 * idle_list doesn't contain any unbound workers when re-bound busy workers
1618 * try to perform local wake-ups for concurrency management.
25511a47 1619 *
ea1abd61
LJ
1620 * Busy workers can rebind after they finish their current work items.
1621 * Queueing the rebind work item at the head of the scheduled list is
1622 * enough. Note that nr_running will be properly bumped as busy workers
1623 * rebind.
25511a47 1624 *
ea1abd61
LJ
1625 * On return, all non-manager workers are scheduled for rebind - see
1626 * manage_workers() for the manager special case. Any idle worker
1627 * including the manager will not appear on @idle_list until rebind is
1628 * complete, making local wake-ups safe.
25511a47 1629 */
94cf58bb 1630static void rebind_workers(struct worker_pool *pool)
25511a47 1631{
ea1abd61 1632 struct worker *worker, *n;
25511a47
TH
1633 int i;
1634
94cf58bb
TH
1635 lockdep_assert_held(&pool->assoc_mutex);
1636 lockdep_assert_held(&pool->lock);
25511a47 1637
5f7dabfd 1638 /* dequeue and kick idle ones */
94cf58bb
TH
1639 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1640 /*
1641 * idle workers should be off @pool->idle_list until rebind
1642 * is complete to avoid receiving premature local wake-ups.
1643 */
1644 list_del_init(&worker->entry);
25511a47 1645
94cf58bb
TH
1646 /*
1647 * worker_thread() will see the above dequeuing and call
1648 * idle_worker_rebind().
1649 */
1650 wake_up_process(worker->task);
1651 }
25511a47 1652
94cf58bb 1653 /* rebind busy workers */
b67bfe0d 1654 for_each_busy_worker(worker, i, pool) {
94cf58bb
TH
1655 struct work_struct *rebind_work = &worker->rebind_work;
1656 struct workqueue_struct *wq;
25511a47 1657
94cf58bb
TH
1658 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1659 work_data_bits(rebind_work)))
1660 continue;
25511a47 1661
94cf58bb 1662 debug_work_activate(rebind_work);
90beca5d 1663
94cf58bb
TH
1664 /*
1665 * wq doesn't really matter but let's keep @worker->pool
112202d9 1666 * and @pwq->pool consistent for sanity.
94cf58bb
TH
1667 */
1668 if (std_worker_pool_pri(worker->pool))
1669 wq = system_highpri_wq;
1670 else
1671 wq = system_wq;
1672
7fb98ea7 1673 insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work,
94cf58bb
TH
1674 worker->scheduled.next,
1675 work_color_to_flags(WORK_NO_COLOR));
ec58815a 1676 }
25511a47
TH
1677}
1678
c34056a3
TH
1679static struct worker *alloc_worker(void)
1680{
1681 struct worker *worker;
1682
1683 worker = kzalloc(sizeof(*worker), GFP_KERNEL);
c8e55f36
TH
1684 if (worker) {
1685 INIT_LIST_HEAD(&worker->entry);
affee4b2 1686 INIT_LIST_HEAD(&worker->scheduled);
25511a47 1687 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
e22bee78
TH
1688 /* on creation a worker is in !idle && prep state */
1689 worker->flags = WORKER_PREP;
c8e55f36 1690 }
c34056a3
TH
1691 return worker;
1692}
1693
1694/**
1695 * create_worker - create a new workqueue worker
63d95a91 1696 * @pool: pool the new worker will belong to
c34056a3 1697 *
63d95a91 1698 * Create a new worker which is bound to @pool. The returned worker
c34056a3
TH
1699 * can be started by calling start_worker() or destroyed using
1700 * destroy_worker().
1701 *
1702 * CONTEXT:
1703 * Might sleep. Does GFP_KERNEL allocations.
1704 *
1705 * RETURNS:
1706 * Pointer to the newly created worker.
1707 */
bc2ae0f5 1708static struct worker *create_worker(struct worker_pool *pool)
c34056a3 1709{
e34cdddb 1710 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
c34056a3 1711 struct worker *worker = NULL;
f3421797 1712 int id = -1;
c34056a3 1713
d565ed63 1714 spin_lock_irq(&pool->lock);
bd7bdd43 1715 while (ida_get_new(&pool->worker_ida, &id)) {
d565ed63 1716 spin_unlock_irq(&pool->lock);
bd7bdd43 1717 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
c34056a3 1718 goto fail;
d565ed63 1719 spin_lock_irq(&pool->lock);
c34056a3 1720 }
d565ed63 1721 spin_unlock_irq(&pool->lock);
c34056a3
TH
1722
1723 worker = alloc_worker();
1724 if (!worker)
1725 goto fail;
1726
bd7bdd43 1727 worker->pool = pool;
c34056a3
TH
1728 worker->id = id;
1729
ec22ca5e 1730 if (pool->cpu != WORK_CPU_UNBOUND)
94dcf29a 1731 worker->task = kthread_create_on_node(worker_thread,
ec22ca5e 1732 worker, cpu_to_node(pool->cpu),
d84ff051 1733 "kworker/%d:%d%s", pool->cpu, id, pri);
f3421797
TH
1734 else
1735 worker->task = kthread_create(worker_thread, worker,
3270476a 1736 "kworker/u:%d%s", id, pri);
c34056a3
TH
1737 if (IS_ERR(worker->task))
1738 goto fail;
1739
e34cdddb 1740 if (std_worker_pool_pri(pool))
3270476a
TH
1741 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1742
db7bccf4 1743 /*
bc2ae0f5 1744 * Determine CPU binding of the new worker depending on
24647570 1745 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
bc2ae0f5
TH
1746 * flag remains stable across this function. See the comments
1747 * above the flag definition for details.
1748 *
1749 * As an unbound worker may later become a regular one if the CPU comes
1750 * online, make sure every worker has %PF_THREAD_BOUND set.
db7bccf4 1751 */
24647570 1752 if (!(pool->flags & POOL_DISASSOCIATED)) {
ec22ca5e 1753 kthread_bind(worker->task, pool->cpu);
bc2ae0f5 1754 } else {
db7bccf4 1755 worker->task->flags |= PF_THREAD_BOUND;
bc2ae0f5 1756 worker->flags |= WORKER_UNBOUND;
f3421797 1757 }
c34056a3
TH
1758
1759 return worker;
1760fail:
1761 if (id >= 0) {
d565ed63 1762 spin_lock_irq(&pool->lock);
bd7bdd43 1763 ida_remove(&pool->worker_ida, id);
d565ed63 1764 spin_unlock_irq(&pool->lock);
c34056a3
TH
1765 }
1766 kfree(worker);
1767 return NULL;
1768}
1769
1770/**
1771 * start_worker - start a newly created worker
1772 * @worker: worker to start
1773 *
706026c2 1774 * Make the pool aware of @worker and start it.
c34056a3
TH
1775 *
1776 * CONTEXT:
d565ed63 1777 * spin_lock_irq(pool->lock).
c34056a3
TH
1778 */
1779static void start_worker(struct worker *worker)
1780{
cb444766 1781 worker->flags |= WORKER_STARTED;
bd7bdd43 1782 worker->pool->nr_workers++;
c8e55f36 1783 worker_enter_idle(worker);
c34056a3
TH
1784 wake_up_process(worker->task);
1785}
1786
1787/**
1788 * destroy_worker - destroy a workqueue worker
1789 * @worker: worker to be destroyed
1790 *
706026c2 1791 * Destroy @worker and adjust @pool stats accordingly.
c8e55f36
TH
1792 *
1793 * CONTEXT:
d565ed63 1794 * spin_lock_irq(pool->lock) which is released and regrabbed.
c34056a3
TH
1795 */
1796static void destroy_worker(struct worker *worker)
1797{
bd7bdd43 1798 struct worker_pool *pool = worker->pool;
c34056a3
TH
1799 int id = worker->id;
1800
1801 /* sanity check frenzy */
6183c009
TH
1802 if (WARN_ON(worker->current_work) ||
1803 WARN_ON(!list_empty(&worker->scheduled)))
1804 return;
c34056a3 1805
c8e55f36 1806 if (worker->flags & WORKER_STARTED)
bd7bdd43 1807 pool->nr_workers--;
c8e55f36 1808 if (worker->flags & WORKER_IDLE)
bd7bdd43 1809 pool->nr_idle--;
c8e55f36
TH
1810
1811 list_del_init(&worker->entry);
cb444766 1812 worker->flags |= WORKER_DIE;
c8e55f36 1813
d565ed63 1814 spin_unlock_irq(&pool->lock);
c8e55f36 1815
c34056a3
TH
1816 kthread_stop(worker->task);
1817 kfree(worker);
1818
d565ed63 1819 spin_lock_irq(&pool->lock);
bd7bdd43 1820 ida_remove(&pool->worker_ida, id);
c34056a3
TH
1821}
1822
63d95a91 1823static void idle_worker_timeout(unsigned long __pool)
e22bee78 1824{
63d95a91 1825 struct worker_pool *pool = (void *)__pool;
e22bee78 1826
d565ed63 1827 spin_lock_irq(&pool->lock);
e22bee78 1828
63d95a91 1829 if (too_many_workers(pool)) {
e22bee78
TH
1830 struct worker *worker;
1831 unsigned long expires;
1832
1833 /* idle_list is kept in LIFO order, check the last one */
63d95a91 1834 worker = list_entry(pool->idle_list.prev, struct worker, entry);
e22bee78
TH
1835 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1836
1837 if (time_before(jiffies, expires))
63d95a91 1838 mod_timer(&pool->idle_timer, expires);
e22bee78
TH
1839 else {
1840 /* it's been idle for too long, wake up manager */
11ebea50 1841 pool->flags |= POOL_MANAGE_WORKERS;
63d95a91 1842 wake_up_worker(pool);
d5abe669 1843 }
e22bee78
TH
1844 }
1845
d565ed63 1846 spin_unlock_irq(&pool->lock);
e22bee78 1847}
d5abe669 1848
493a1724 1849static void send_mayday(struct work_struct *work)
e22bee78 1850{
112202d9
TH
1851 struct pool_workqueue *pwq = get_work_pwq(work);
1852 struct workqueue_struct *wq = pwq->wq;
493a1724
TH
1853
1854 lockdep_assert_held(&workqueue_lock);
e22bee78
TH
1855
1856 if (!(wq->flags & WQ_RESCUER))
493a1724 1857 return;
e22bee78
TH
1858
1859 /* mayday mayday mayday */
493a1724
TH
1860 if (list_empty(&pwq->mayday_node)) {
1861 list_add_tail(&pwq->mayday_node, &wq->maydays);
e22bee78 1862 wake_up_process(wq->rescuer->task);
493a1724 1863 }
e22bee78
TH
1864}
1865
706026c2 1866static void pool_mayday_timeout(unsigned long __pool)
e22bee78 1867{
63d95a91 1868 struct worker_pool *pool = (void *)__pool;
e22bee78
TH
1869 struct work_struct *work;
1870
493a1724
TH
1871 spin_lock_irq(&workqueue_lock); /* for wq->maydays */
1872 spin_lock(&pool->lock);
e22bee78 1873
63d95a91 1874 if (need_to_create_worker(pool)) {
e22bee78
TH
1875 /*
1876 * We've been trying to create a new worker but
1877 * haven't been successful. We might be hitting an
1878 * allocation deadlock. Send distress signals to
1879 * rescuers.
1880 */
63d95a91 1881 list_for_each_entry(work, &pool->worklist, entry)
e22bee78 1882 send_mayday(work);
1da177e4 1883 }
e22bee78 1884
493a1724
TH
1885 spin_unlock(&pool->lock);
1886 spin_unlock_irq(&workqueue_lock);
e22bee78 1887
63d95a91 1888 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1da177e4
LT
1889}
1890
e22bee78
TH
1891/**
1892 * maybe_create_worker - create a new worker if necessary
63d95a91 1893 * @pool: pool to create a new worker for
e22bee78 1894 *
63d95a91 1895 * Create a new worker for @pool if necessary. @pool is guaranteed to
e22bee78
TH
1896 * have at least one idle worker on return from this function. If
1897 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
63d95a91 1898 * sent to all rescuers with works scheduled on @pool to resolve
e22bee78
TH
1899 * possible allocation deadlock.
1900 *
1901 * On return, need_to_create_worker() is guaranteed to be false and
1902 * may_start_working() true.
1903 *
1904 * LOCKING:
d565ed63 1905 * spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
1906 * multiple times. Does GFP_KERNEL allocations. Called only from
1907 * manager.
1908 *
1909 * RETURNS:
d565ed63 1910 * false if no action was taken and pool->lock stayed locked, true
e22bee78
TH
1911 * otherwise.
1912 */
63d95a91 1913static bool maybe_create_worker(struct worker_pool *pool)
d565ed63
TH
1914__releases(&pool->lock)
1915__acquires(&pool->lock)
1da177e4 1916{
63d95a91 1917 if (!need_to_create_worker(pool))
e22bee78
TH
1918 return false;
1919restart:
d565ed63 1920 spin_unlock_irq(&pool->lock);
9f9c2364 1921
e22bee78 1922 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
63d95a91 1923 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
e22bee78
TH
1924
1925 while (true) {
1926 struct worker *worker;
1927
bc2ae0f5 1928 worker = create_worker(pool);
e22bee78 1929 if (worker) {
63d95a91 1930 del_timer_sync(&pool->mayday_timer);
d565ed63 1931 spin_lock_irq(&pool->lock);
e22bee78 1932 start_worker(worker);
6183c009
TH
1933 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1934 goto restart;
e22bee78
TH
1935 return true;
1936 }
1937
63d95a91 1938 if (!need_to_create_worker(pool))
e22bee78 1939 break;
1da177e4 1940
e22bee78
TH
1941 __set_current_state(TASK_INTERRUPTIBLE);
1942 schedule_timeout(CREATE_COOLDOWN);
9f9c2364 1943
63d95a91 1944 if (!need_to_create_worker(pool))
e22bee78
TH
1945 break;
1946 }
1947
63d95a91 1948 del_timer_sync(&pool->mayday_timer);
d565ed63 1949 spin_lock_irq(&pool->lock);
63d95a91 1950 if (need_to_create_worker(pool))
e22bee78
TH
1951 goto restart;
1952 return true;
1953}
1954
1955/**
1956 * maybe_destroy_worker - destroy workers which have been idle for a while
63d95a91 1957 * @pool: pool to destroy workers for
e22bee78 1958 *
63d95a91 1959 * Destroy @pool workers which have been idle for longer than
e22bee78
TH
1960 * IDLE_WORKER_TIMEOUT.
1961 *
1962 * LOCKING:
d565ed63 1963 * spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
1964 * multiple times. Called only from manager.
1965 *
1966 * RETURNS:
d565ed63 1967 * false if no action was taken and pool->lock stayed locked, true
e22bee78
TH
1968 * otherwise.
1969 */
63d95a91 1970static bool maybe_destroy_workers(struct worker_pool *pool)
e22bee78
TH
1971{
1972 bool ret = false;
1da177e4 1973
63d95a91 1974 while (too_many_workers(pool)) {
e22bee78
TH
1975 struct worker *worker;
1976 unsigned long expires;
3af24433 1977
63d95a91 1978 worker = list_entry(pool->idle_list.prev, struct worker, entry);
e22bee78 1979 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
85f4186a 1980
e22bee78 1981 if (time_before(jiffies, expires)) {
63d95a91 1982 mod_timer(&pool->idle_timer, expires);
3af24433 1983 break;
e22bee78 1984 }
1da177e4 1985
e22bee78
TH
1986 destroy_worker(worker);
1987 ret = true;
1da177e4 1988 }
1e19ffc6 1989
e22bee78 1990 return ret;
1e19ffc6
TH
1991}
1992
73f53c4a 1993/**
e22bee78
TH
1994 * manage_workers - manage worker pool
1995 * @worker: self
73f53c4a 1996 *
706026c2 1997 * Assume the manager role and manage the worker pool @worker belongs
e22bee78 1998 * to. At any given time, there can be only zero or one manager per
706026c2 1999 * pool. The exclusion is handled automatically by this function.
e22bee78
TH
2000 *
2001 * The caller can safely start processing works on false return. On
2002 * true return, it's guaranteed that need_to_create_worker() is false
2003 * and may_start_working() is true.
73f53c4a
TH
2004 *
2005 * CONTEXT:
d565ed63 2006 * spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
2007 * multiple times. Does GFP_KERNEL allocations.
2008 *
2009 * RETURNS:
d565ed63
TH
2010 * %false if pool->lock stayed locked, %true if it was released and
2011 * regrabbed and the caller's earlier checks may no longer hold.
73f53c4a 2012 */
e22bee78 2013static bool manage_workers(struct worker *worker)
73f53c4a 2014{
63d95a91 2015 struct worker_pool *pool = worker->pool;
e22bee78 2016 bool ret = false;
73f53c4a 2017
ee378aa4 2018 if (pool->flags & POOL_MANAGING_WORKERS)
e22bee78 2019 return ret;
1e19ffc6 2020
552a37e9 2021 pool->flags |= POOL_MANAGING_WORKERS;
73f53c4a 2022
ee378aa4
LJ
2023 /*
2024 * To simplify both worker management and CPU hotplug, hold off
2025 * management while hotplug is in progress. CPU hotplug path can't
2026 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2027 * lead to idle worker depletion (all become busy thinking someone
2028 * else is managing) which in turn can result in deadlock under
b2eb83d1 2029 * extreme circumstances. Use @pool->assoc_mutex to synchronize
ee378aa4
LJ
2030 * manager against CPU hotplug.
2031 *
b2eb83d1 2032 * assoc_mutex would always be free unless CPU hotplug is in
d565ed63 2033 * progress. trylock first without dropping @pool->lock.
ee378aa4 2034 */
b2eb83d1 2035 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
d565ed63 2036 spin_unlock_irq(&pool->lock);
b2eb83d1 2037 mutex_lock(&pool->assoc_mutex);
ee378aa4
LJ
2038 /*
2039 * CPU hotplug could have happened while we were waiting
b2eb83d1 2040 * for assoc_mutex. Hotplug itself can't handle us
ee378aa4 2041 * because the manager isn't on either the idle or the busy list, and
706026c2 2042 * @pool's state and ours could have deviated.
ee378aa4 2043 *
b2eb83d1 2044 * As hotplug is now excluded via assoc_mutex, we can
ee378aa4 2045 * simply try to bind. It will succeed or fail depending
706026c2 2046 * on @pool's current state. Try it and adjust
ee378aa4
LJ
2047 * %WORKER_UNBOUND accordingly.
2048 */
f36dc67b 2049 if (worker_maybe_bind_and_lock(pool))
ee378aa4
LJ
2050 worker->flags &= ~WORKER_UNBOUND;
2051 else
2052 worker->flags |= WORKER_UNBOUND;
73f53c4a 2053
ee378aa4
LJ
2054 ret = true;
2055 }
73f53c4a 2056
11ebea50 2057 pool->flags &= ~POOL_MANAGE_WORKERS;
73f53c4a
TH
2058
2059 /*
e22bee78
TH
2060 * Destroy and then create so that may_start_working() is true
2061 * on return.
73f53c4a 2062 */
63d95a91
TH
2063 ret |= maybe_destroy_workers(pool);
2064 ret |= maybe_create_worker(pool);
e22bee78 2065
552a37e9 2066 pool->flags &= ~POOL_MANAGING_WORKERS;
b2eb83d1 2067 mutex_unlock(&pool->assoc_mutex);
e22bee78 2068 return ret;
73f53c4a
TH
2069}
2070
a62428c0
TH
2071/**
2072 * process_one_work - process single work
c34056a3 2073 * @worker: self
a62428c0
TH
2074 * @work: work to process
2075 *
2076 * Process @work. This function contains all the logic necessary to
2077 * process a single work including synchronization against and
2078 * interaction with other workers on the same cpu, queueing and
2079 * flushing. As long as context requirement is met, any worker can
2080 * call this function to process a work.
2081 *
2082 * CONTEXT:
d565ed63 2083 * spin_lock_irq(pool->lock) which is released and regrabbed.
a62428c0 2084 */
c34056a3 2085static void process_one_work(struct worker *worker, struct work_struct *work)
d565ed63
TH
2086__releases(&pool->lock)
2087__acquires(&pool->lock)
a62428c0 2088{
112202d9 2089 struct pool_workqueue *pwq = get_work_pwq(work);
bd7bdd43 2090 struct worker_pool *pool = worker->pool;
112202d9 2091 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
73f53c4a 2092 int work_color;
7e11629d 2093 struct worker *collision;
a62428c0
TH
2094#ifdef CONFIG_LOCKDEP
2095 /*
2096 * It is permissible to free the struct work_struct from
2097 * inside the function that is called from it; we need to take
2098 * this into account for lockdep too. To avoid bogus "held
2099 * lock freed" warnings as well as problems when looking into
2100 * work->lockdep_map, make a copy and use that here.
2101 */
4d82a1de
PZ
2102 struct lockdep_map lockdep_map;
2103
2104 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
a62428c0 2105#endif
6fec10a1
TH
2106 /*
2107 * Ensure we're on the correct CPU. DISASSOCIATED test is
2108 * necessary to avoid spurious warnings from rescuers servicing the
24647570 2109 * unbound or a disassociated pool.
6fec10a1 2110 */
5f7dabfd 2111 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
24647570 2112 !(pool->flags & POOL_DISASSOCIATED) &&
ec22ca5e 2113 raw_smp_processor_id() != pool->cpu);
25511a47 2114
7e11629d
TH
2115 /*
2116 * A single work shouldn't be executed concurrently by
2117 * multiple workers on a single cpu. Check whether anyone is
2118 * already processing the work. If so, defer the work to the
2119 * currently executing one.
2120 */
c9e7cf27 2121 collision = find_worker_executing_work(pool, work);
7e11629d
TH
2122 if (unlikely(collision)) {
2123 move_linked_works(work, &collision->scheduled, NULL);
2124 return;
2125 }
2126
8930caba 2127 /* claim and dequeue */
a62428c0 2128 debug_work_deactivate(work);
c9e7cf27 2129 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
c34056a3 2130 worker->current_work = work;
a2c1c57b 2131 worker->current_func = work->func;
112202d9 2132 worker->current_pwq = pwq;
73f53c4a 2133 work_color = get_work_color(work);
7a22ad75 2134
a62428c0
TH
2135 list_del_init(&work->entry);
2136
fb0e7beb
TH
2137 /*
2138 * CPU intensive works don't participate in concurrency
2139 * management. They're the scheduler's responsibility.
2140 */
2141 if (unlikely(cpu_intensive))
2142 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2143
974271c4 2144 /*
d565ed63 2145 * Unbound pool isn't concurrency managed and work items should be
974271c4
TH
2146 * executed ASAP. Wake up another worker if necessary.
2147 */
63d95a91
TH
2148 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2149 wake_up_worker(pool);
974271c4 2150
8930caba 2151 /*
7c3eed5c 2152 * Record the last pool and clear PENDING which should be the last
d565ed63 2153 * update to @work. Also, do this inside @pool->lock so that
23657bb1
TH
2154 * PENDING and queued state changes happen together while IRQ is
2155 * disabled.
8930caba 2156 */
7c3eed5c 2157 set_work_pool_and_clear_pending(work, pool->id);
a62428c0 2158
d565ed63 2159 spin_unlock_irq(&pool->lock);
a62428c0 2160
112202d9 2161 lock_map_acquire_read(&pwq->wq->lockdep_map);
a62428c0 2162 lock_map_acquire(&lockdep_map);
e36c886a 2163 trace_workqueue_execute_start(work);
a2c1c57b 2164 worker->current_func(work);
e36c886a
AV
2165 /*
2166 * While we must be careful to not use "work" after this, the trace
2167 * point will only record its address.
2168 */
2169 trace_workqueue_execute_end(work);
a62428c0 2170 lock_map_release(&lockdep_map);
112202d9 2171 lock_map_release(&pwq->wq->lockdep_map);
a62428c0
TH
2172
2173 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
044c782c
VI
2174 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2175 " last function: %pf\n",
a2c1c57b
TH
2176 current->comm, preempt_count(), task_pid_nr(current),
2177 worker->current_func);
a62428c0
TH
2178 debug_show_held_locks(current);
2179 dump_stack();
2180 }
2181
d565ed63 2182 spin_lock_irq(&pool->lock);
a62428c0 2183
fb0e7beb
TH
2184 /* clear cpu intensive status */
2185 if (unlikely(cpu_intensive))
2186 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2187
a62428c0 2188 /* we're done with it, release */
42f8570f 2189 hash_del(&worker->hentry);
c34056a3 2190 worker->current_work = NULL;
a2c1c57b 2191 worker->current_func = NULL;
112202d9
TH
2192 worker->current_pwq = NULL;
2193 pwq_dec_nr_in_flight(pwq, work_color);
a62428c0
TH
2194}
2195
affee4b2
TH
2196/**
2197 * process_scheduled_works - process scheduled works
2198 * @worker: self
2199 *
2200 * Process all scheduled works. Please note that the scheduled list
2201 * may change while processing a work, so this function repeatedly
2202 * fetches a work from the top and executes it.
2203 *
2204 * CONTEXT:
d565ed63 2205 * spin_lock_irq(pool->lock) which may be released and regrabbed
affee4b2
TH
2206 * multiple times.
2207 */
2208static void process_scheduled_works(struct worker *worker)
1da177e4 2209{
affee4b2
TH
2210 while (!list_empty(&worker->scheduled)) {
2211 struct work_struct *work = list_first_entry(&worker->scheduled,
1da177e4 2212 struct work_struct, entry);
c34056a3 2213 process_one_work(worker, work);
1da177e4 2214 }
1da177e4
LT
2215}
2216
4690c4ab
TH
2217/**
2218 * worker_thread - the worker thread function
c34056a3 2219 * @__worker: self
4690c4ab 2220 *
706026c2
TH
2221 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2222 * of these per each cpu. These workers process all works regardless of
e22bee78
TH
2223 * their specific target workqueue. The only exception is works which
2224 * belong to workqueues with a rescuer which will be explained in
2225 * rescuer_thread().
4690c4ab 2226 */
c34056a3 2227static int worker_thread(void *__worker)
1da177e4 2228{
c34056a3 2229 struct worker *worker = __worker;
bd7bdd43 2230 struct worker_pool *pool = worker->pool;
1da177e4 2231
e22bee78
TH
2232 /* tell the scheduler that this is a workqueue worker */
2233 worker->task->flags |= PF_WQ_WORKER;
c8e55f36 2234woke_up:
d565ed63 2235 spin_lock_irq(&pool->lock);
1da177e4 2236
5f7dabfd
LJ
2237 /* we are off idle list if destruction or rebind is requested */
2238 if (unlikely(list_empty(&worker->entry))) {
d565ed63 2239 spin_unlock_irq(&pool->lock);
25511a47 2240
5f7dabfd 2241 /* if DIE is set, destruction is requested */
25511a47
TH
2242 if (worker->flags & WORKER_DIE) {
2243 worker->task->flags &= ~PF_WQ_WORKER;
2244 return 0;
2245 }
2246
5f7dabfd 2247 /* otherwise, rebind */
25511a47
TH
2248 idle_worker_rebind(worker);
2249 goto woke_up;
c8e55f36 2250 }
affee4b2 2251
c8e55f36 2252 worker_leave_idle(worker);
db7bccf4 2253recheck:
e22bee78 2254 /* no more worker necessary? */
63d95a91 2255 if (!need_more_worker(pool))
e22bee78
TH
2256 goto sleep;
2257
2258 /* do we need to manage? */
63d95a91 2259 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
e22bee78
TH
2260 goto recheck;
2261
c8e55f36
TH
2262 /*
2263 * ->scheduled list can only be filled while a worker is
2264 * preparing to process a work or actually processing it.
2265 * Make sure nobody diddled with it while I was sleeping.
2266 */
6183c009 2267 WARN_ON_ONCE(!list_empty(&worker->scheduled));
c8e55f36 2268
e22bee78
TH
2269 /*
2270 * When control reaches this point, we're guaranteed to have
2271 * at least one idle worker or that someone else has already
2272 * assumed the manager role.
2273 */
2274 worker_clr_flags(worker, WORKER_PREP);
2275
2276 do {
c8e55f36 2277 struct work_struct *work =
bd7bdd43 2278 list_first_entry(&pool->worklist,
c8e55f36
TH
2279 struct work_struct, entry);
2280
2281 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2282 /* optimization path, not strictly necessary */
2283 process_one_work(worker, work);
2284 if (unlikely(!list_empty(&worker->scheduled)))
affee4b2 2285 process_scheduled_works(worker);
c8e55f36
TH
2286 } else {
2287 move_linked_works(work, &worker->scheduled, NULL);
2288 process_scheduled_works(worker);
affee4b2 2289 }
63d95a91 2290 } while (keep_working(pool));
e22bee78
TH
2291
2292 worker_set_flags(worker, WORKER_PREP, false);
d313dd85 2293sleep:
63d95a91 2294 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
e22bee78 2295 goto recheck;
d313dd85 2296
c8e55f36 2297 /*
d565ed63
TH
2298 * pool->lock is held and there's no work to process and no need to
2299 * manage, sleep. Workers are woken up only while holding
2300 * pool->lock or from local cpu, so setting the current state
2301 * before releasing pool->lock is enough to prevent losing any
2302 * event.
c8e55f36
TH
2303 */
2304 worker_enter_idle(worker);
2305 __set_current_state(TASK_INTERRUPTIBLE);
d565ed63 2306 spin_unlock_irq(&pool->lock);
c8e55f36
TH
2307 schedule();
2308 goto woke_up;
1da177e4
LT
2309}
2310
e22bee78
TH
2311/**
2312 * rescuer_thread - the rescuer thread function
111c225a 2313 * @__rescuer: self
e22bee78
TH
2314 *
2315 * Workqueue rescuer thread function. There's one rescuer for each
2316 * workqueue which has WQ_RESCUER set.
2317 *
706026c2 2318 * Regular work processing on a pool may block trying to create a new
e22bee78
TH
2319 * worker, which uses a GFP_KERNEL allocation that has a slight chance of
2320 * developing into a deadlock if some works currently on the same queue
2321 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2322 * the problem rescuer solves.
2323 *
706026c2
TH
2324 * When such a condition is possible, the pool summons rescuers of all
2325 * workqueues which have works queued on the pool and lets them process
e22bee78
TH
2326 * those works so that forward progress can be guaranteed.
2327 *
2328 * This should happen rarely.
2329 */
111c225a 2330static int rescuer_thread(void *__rescuer)
e22bee78 2331{
111c225a
TH
2332 struct worker *rescuer = __rescuer;
2333 struct workqueue_struct *wq = rescuer->rescue_wq;
e22bee78 2334 struct list_head *scheduled = &rescuer->scheduled;
e22bee78
TH
2335
2336 set_user_nice(current, RESCUER_NICE_LEVEL);
111c225a
TH
2337
2338 /*
2339 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2340 * doesn't participate in concurrency management.
2341 */
2342 rescuer->task->flags |= PF_WQ_WORKER;
e22bee78
TH
2343repeat:
2344 set_current_state(TASK_INTERRUPTIBLE);
2345
412d32e6
MG
2346 if (kthread_should_stop()) {
2347 __set_current_state(TASK_RUNNING);
111c225a 2348 rescuer->task->flags &= ~PF_WQ_WORKER;
e22bee78 2349 return 0;
412d32e6 2350 }
e22bee78 2351
493a1724
TH
2352 /* see whether any pwq is asking for help */
2353 spin_lock_irq(&workqueue_lock);
2354
2355 while (!list_empty(&wq->maydays)) {
2356 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2357 struct pool_workqueue, mayday_node);
112202d9 2358 struct worker_pool *pool = pwq->pool;
e22bee78
TH
2359 struct work_struct *work, *n;
2360
2361 __set_current_state(TASK_RUNNING);
493a1724
TH
2362 list_del_init(&pwq->mayday_node);
2363
2364 spin_unlock_irq(&workqueue_lock);
e22bee78
TH
2365
2366 /* migrate to the target cpu if possible */
f36dc67b 2367 worker_maybe_bind_and_lock(pool);
b3104104 2368 rescuer->pool = pool;
e22bee78
TH
2369
2370 /*
2371 * Slurp in all works issued via this workqueue and
2372 * process'em.
2373 */
6183c009 2374 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
bd7bdd43 2375 list_for_each_entry_safe(work, n, &pool->worklist, entry)
112202d9 2376 if (get_work_pwq(work) == pwq)
e22bee78
TH
2377 move_linked_works(work, scheduled, &n);
2378
2379 process_scheduled_works(rescuer);
7576958a
TH
2380
2381 /*
d565ed63 2382 * Leave this pool. If keep_working() is %true, notify a
7576958a
TH
2383 * regular worker; otherwise, we end up with 0 concurrency
2384 * and stalling the execution.
2385 */
63d95a91
TH
2386 if (keep_working(pool))
2387 wake_up_worker(pool);
7576958a 2388
b3104104 2389 rescuer->pool = NULL;
493a1724
TH
2390 spin_unlock(&pool->lock);
2391 spin_lock(&workqueue_lock);
e22bee78
TH
2392 }
2393
493a1724
TH
2394 spin_unlock_irq(&workqueue_lock);
2395
111c225a
TH
2396 /* rescuers should never participate in concurrency management */
2397 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
e22bee78
TH
2398 schedule();
2399 goto repeat;
1da177e4
LT
2400}
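/*
 * Illustrative sketch (not part of this file, assumes the usual
 * <linux/workqueue.h> and <linux/init.h> includes): a driver whose work may
 * be needed to make forward progress during memory reclaim allocates its own
 * workqueue with WQ_MEM_RECLAIM so that a rescuer is available to it.
 * fs_io_wq, fs_io_work and fs_io_fn are hypothetical names.
 */
static struct workqueue_struct *fs_io_wq;

static void fs_io_fn(struct work_struct *work)
{
        /* writeback-style processing that may be required to free memory */
}

static DECLARE_WORK(fs_io_work, fs_io_fn);

static int __init fs_io_init(void)
{
        /* WQ_MEM_RECLAIM gives the workqueue a rescuer thread */
        fs_io_wq = alloc_workqueue("fs_io", WQ_MEM_RECLAIM, 1);
        if (!fs_io_wq)
                return -ENOMEM;
        queue_work(fs_io_wq, &fs_io_work);
        return 0;
}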
2401
fc2e4d70
ON
2402struct wq_barrier {
2403 struct work_struct work;
2404 struct completion done;
2405};
2406
2407static void wq_barrier_func(struct work_struct *work)
2408{
2409 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2410 complete(&barr->done);
2411}
2412
4690c4ab
TH
2413/**
2414 * insert_wq_barrier - insert a barrier work
112202d9 2415 * @pwq: pwq to insert barrier into
4690c4ab 2416 * @barr: wq_barrier to insert
affee4b2
TH
2417 * @target: target work to attach @barr to
2418 * @worker: worker currently executing @target, NULL if @target is not executing
4690c4ab 2419 *
affee4b2
TH
2420 * @barr is linked to @target such that @barr is completed only after
2421 * @target finishes execution. Please note that the ordering
2422 * guarantee is observed only with respect to @target and on the local
2423 * cpu.
2424 *
2425 * Currently, a queued barrier can't be canceled. This is because
2426 * try_to_grab_pending() can't determine whether the work to be
2427 * grabbed is at the head of the queue and thus can't clear LINKED
2428 * flag of the previous work while there must be a valid next work
2429 * after a work with LINKED flag set.
2430 *
2431 * Note that when @worker is non-NULL, @target may be modified
112202d9 2432 * underneath us, so we can't reliably determine pwq from @target.
4690c4ab
TH
2433 *
2434 * CONTEXT:
d565ed63 2435 * spin_lock_irq(pool->lock).
4690c4ab 2436 */
112202d9 2437static void insert_wq_barrier(struct pool_workqueue *pwq,
affee4b2
TH
2438 struct wq_barrier *barr,
2439 struct work_struct *target, struct worker *worker)
fc2e4d70 2440{
affee4b2
TH
2441 struct list_head *head;
2442 unsigned int linked = 0;
2443
dc186ad7 2444 /*
d565ed63 2445 * debugobject calls are safe here even with pool->lock locked
dc186ad7
TG
2446 * as we know for sure that this will not trigger any of the
2447 * checks and call back into the fixup functions where we
2448 * might deadlock.
2449 */
ca1cab37 2450 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
22df02bb 2451 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
fc2e4d70 2452 init_completion(&barr->done);
83c22520 2453
affee4b2
TH
2454 /*
2455 * If @target is currently being executed, schedule the
2456 * barrier to the worker; otherwise, put it after @target.
2457 */
2458 if (worker)
2459 head = worker->scheduled.next;
2460 else {
2461 unsigned long *bits = work_data_bits(target);
2462
2463 head = target->entry.next;
2464 /* there can already be other linked works, inherit and set */
2465 linked = *bits & WORK_STRUCT_LINKED;
2466 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
2467 }
2468
dc186ad7 2469 debug_work_activate(&barr->work);
112202d9 2470 insert_work(pwq, &barr->work, head,
affee4b2 2471 work_color_to_flags(WORK_NO_COLOR) | linked);
fc2e4d70
ON
2472}
2473
73f53c4a 2474/**
112202d9 2475 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
73f53c4a
TH
2476 * @wq: workqueue being flushed
2477 * @flush_color: new flush color, < 0 for no-op
2478 * @work_color: new work color, < 0 for no-op
2479 *
112202d9 2480 * Prepare pwqs for workqueue flushing.
73f53c4a 2481 *
112202d9
TH
2482 * If @flush_color is non-negative, flush_color on all pwqs should be
2483 * -1. If no pwq has in-flight commands at the specified color, all
2484 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2485 * has in-flight commands, its pwq->flush_color is set to
2486 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
73f53c4a
TH
2487 * wakeup logic is armed and %true is returned.
2488 *
2489 * The caller should have initialized @wq->first_flusher prior to
2490 * calling this function with non-negative @flush_color. If
2491 * @flush_color is negative, no flush color update is done and %false
2492 * is returned.
2493 *
112202d9 2494 * If @work_color is non-negative, all pwqs should have the same
73f53c4a
TH
2495 * work_color which is previous to @work_color and all will be
2496 * advanced to @work_color.
2497 *
2498 * CONTEXT:
2499 * mutex_lock(wq->flush_mutex).
2500 *
2501 * RETURNS:
2502 * %true if @flush_color >= 0 and there's something to flush. %false
2503 * otherwise.
2504 */
112202d9 2505static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
73f53c4a 2506 int flush_color, int work_color)
1da177e4 2507{
73f53c4a 2508 bool wait = false;
49e3cf44 2509 struct pool_workqueue *pwq;
1da177e4 2510
73f53c4a 2511 if (flush_color >= 0) {
6183c009 2512 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
112202d9 2513 atomic_set(&wq->nr_pwqs_to_flush, 1);
1da177e4 2514 }
2355b70f 2515
76af4d93
TH
2516 local_irq_disable();
2517
49e3cf44 2518 for_each_pwq(pwq, wq) {
112202d9 2519 struct worker_pool *pool = pwq->pool;
fc2e4d70 2520
76af4d93 2521 spin_lock(&pool->lock);
83c22520 2522
73f53c4a 2523 if (flush_color >= 0) {
6183c009 2524 WARN_ON_ONCE(pwq->flush_color != -1);
fc2e4d70 2525
112202d9
TH
2526 if (pwq->nr_in_flight[flush_color]) {
2527 pwq->flush_color = flush_color;
2528 atomic_inc(&wq->nr_pwqs_to_flush);
73f53c4a
TH
2529 wait = true;
2530 }
2531 }
1da177e4 2532
73f53c4a 2533 if (work_color >= 0) {
6183c009 2534 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
112202d9 2535 pwq->work_color = work_color;
73f53c4a 2536 }
1da177e4 2537
76af4d93 2538 spin_unlock(&pool->lock);
1da177e4 2539 }
2355b70f 2540
76af4d93
TH
2541 local_irq_enable();
2542
112202d9 2543 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
73f53c4a 2544 complete(&wq->first_flusher->done);
14441960 2545
73f53c4a 2546 return wait;
1da177e4
LT
2547}
2548
0fcb78c2 2549/**
1da177e4 2550 * flush_workqueue - ensure that any scheduled work has run to completion.
0fcb78c2 2551 * @wq: workqueue to flush
1da177e4
LT
2552 *
2553 * Forces execution of the workqueue and blocks until its completion.
2554 * This is typically used in driver shutdown handlers.
2555 *
fc2e4d70
ON
2556 * We sleep until all works which were queued on entry have been handled,
2557 * but we are not livelocked by new incoming ones.
1da177e4 2558 */
7ad5b3a5 2559void flush_workqueue(struct workqueue_struct *wq)
1da177e4 2560{
73f53c4a
TH
2561 struct wq_flusher this_flusher = {
2562 .list = LIST_HEAD_INIT(this_flusher.list),
2563 .flush_color = -1,
2564 .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
2565 };
2566 int next_color;
1da177e4 2567
3295f0ef
IM
2568 lock_map_acquire(&wq->lockdep_map);
2569 lock_map_release(&wq->lockdep_map);
73f53c4a
TH
2570
2571 mutex_lock(&wq->flush_mutex);
2572
2573 /*
2574 * Start-to-wait phase
2575 */
2576 next_color = work_next_color(wq->work_color);
2577
2578 if (next_color != wq->flush_color) {
2579 /*
2580 * Color space is not full. The current work_color
2581 * becomes our flush_color and work_color is advanced
2582 * by one.
2583 */
6183c009 2584 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
73f53c4a
TH
2585 this_flusher.flush_color = wq->work_color;
2586 wq->work_color = next_color;
2587
2588 if (!wq->first_flusher) {
2589 /* no flush in progress, become the first flusher */
6183c009 2590 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
2591
2592 wq->first_flusher = &this_flusher;
2593
112202d9 2594 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
73f53c4a
TH
2595 wq->work_color)) {
2596 /* nothing to flush, done */
2597 wq->flush_color = next_color;
2598 wq->first_flusher = NULL;
2599 goto out_unlock;
2600 }
2601 } else {
2602 /* wait in queue */
6183c009 2603 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
73f53c4a 2604 list_add_tail(&this_flusher.list, &wq->flusher_queue);
112202d9 2605 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
2606 }
2607 } else {
2608 /*
2609 * Oops, color space is full, wait on overflow queue.
2610 * The next flush completion will assign us
2611 * flush_color and transfer to flusher_queue.
2612 */
2613 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2614 }
2615
2616 mutex_unlock(&wq->flush_mutex);
2617
2618 wait_for_completion(&this_flusher.done);
2619
2620 /*
2621 * Wake-up-and-cascade phase
2622 *
2623 * First flushers are responsible for cascading flushes and
2624 * handling overflow. Non-first flushers can simply return.
2625 */
2626 if (wq->first_flusher != &this_flusher)
2627 return;
2628
2629 mutex_lock(&wq->flush_mutex);
2630
4ce48b37
TH
2631 /* we might have raced, check again with mutex held */
2632 if (wq->first_flusher != &this_flusher)
2633 goto out_unlock;
2634
73f53c4a
TH
2635 wq->first_flusher = NULL;
2636
6183c009
TH
2637 WARN_ON_ONCE(!list_empty(&this_flusher.list));
2638 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
2639
2640 while (true) {
2641 struct wq_flusher *next, *tmp;
2642
2643 /* complete all the flushers sharing the current flush color */
2644 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
2645 if (next->flush_color != wq->flush_color)
2646 break;
2647 list_del_init(&next->list);
2648 complete(&next->done);
2649 }
2650
6183c009
TH
2651 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2652 wq->flush_color != work_next_color(wq->work_color));
73f53c4a
TH
2653
2654 /* this flush_color is finished, advance by one */
2655 wq->flush_color = work_next_color(wq->flush_color);
2656
2657 /* one color has been freed, handle overflow queue */
2658 if (!list_empty(&wq->flusher_overflow)) {
2659 /*
2660 * Assign the same color to all overflowed
2661 * flushers, advance work_color and append to
2662 * flusher_queue. This is the start-to-wait
2663 * phase for these overflowed flushers.
2664 */
2665 list_for_each_entry(tmp, &wq->flusher_overflow, list)
2666 tmp->flush_color = wq->work_color;
2667
2668 wq->work_color = work_next_color(wq->work_color);
2669
2670 list_splice_tail_init(&wq->flusher_overflow,
2671 &wq->flusher_queue);
112202d9 2672 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
2673 }
2674
2675 if (list_empty(&wq->flusher_queue)) {
6183c009 2676 WARN_ON_ONCE(wq->flush_color != wq->work_color);
73f53c4a
TH
2677 break;
2678 }
2679
2680 /*
2681 * Need to flush more colors. Make the next flusher
112202d9 2682 * the new first flusher and arm pwqs.
73f53c4a 2683 */
6183c009
TH
2684 WARN_ON_ONCE(wq->flush_color == wq->work_color);
2685 WARN_ON_ONCE(wq->flush_color != next->flush_color);
73f53c4a
TH
2686
2687 list_del_init(&next->list);
2688 wq->first_flusher = next;
2689
112202d9 2690 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
73f53c4a
TH
2691 break;
2692
2693 /*
2694 * Meh... this color is already done, clear first
2695 * flusher and repeat cascading.
2696 */
2697 wq->first_flusher = NULL;
2698 }
2699
2700out_unlock:
2701 mutex_unlock(&wq->flush_mutex);
1da177e4 2702}
ae90dd5d 2703EXPORT_SYMBOL_GPL(flush_workqueue);
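/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * typical flush_workqueue() use in a shutdown path.  my_wq and
 * my_driver_shutdown() are hypothetical names.
 */
static struct workqueue_struct *my_wq;

static void my_driver_shutdown(void)
{
        /*
         * The caller has already stopped queueing new work items;
         * flush_workqueue() then blocks until every work item that was
         * queued on my_wq before this point has finished executing.
         */
        flush_workqueue(my_wq);
}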
1da177e4 2704
9c5a2ba7
TH
2705/**
2706 * drain_workqueue - drain a workqueue
2707 * @wq: workqueue to drain
2708 *
2709 * Wait until the workqueue becomes empty. While draining is in progress,
2710 * only chain queueing is allowed. IOW, only currently pending or running
2711 * work items on @wq can queue further work items on it. @wq is flushed
2712 * repeatedly until it becomes empty. The number of flushes is determined
2713 * by the depth of chaining and should be relatively short. Whine if it
2714 * takes too long.
2715 */
2716void drain_workqueue(struct workqueue_struct *wq)
2717{
2718 unsigned int flush_cnt = 0;
49e3cf44 2719 struct pool_workqueue *pwq;
9c5a2ba7
TH
2720
2721 /*
2722 * __queue_work() needs to test whether there are drainers, is much
2723 * hotter than drain_workqueue() and already looks at @wq->flags.
2724 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
2725 */
e98d5b16 2726 spin_lock_irq(&workqueue_lock);
9c5a2ba7
TH
2727 if (!wq->nr_drainers++)
2728 wq->flags |= WQ_DRAINING;
e98d5b16 2729 spin_unlock_irq(&workqueue_lock);
9c5a2ba7
TH
2730reflush:
2731 flush_workqueue(wq);
2732
76af4d93
TH
2733 local_irq_disable();
2734
49e3cf44 2735 for_each_pwq(pwq, wq) {
fa2563e4 2736 bool drained;
9c5a2ba7 2737
76af4d93 2738 spin_lock(&pwq->pool->lock);
112202d9 2739 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
76af4d93 2740 spin_unlock(&pwq->pool->lock);
fa2563e4
TT
2741
2742 if (drained)
9c5a2ba7
TH
2743 continue;
2744
2745 if (++flush_cnt == 10 ||
2746 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
044c782c
VI
2747 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
2748 wq->name, flush_cnt);
76af4d93
TH
2749
2750 local_irq_enable();
9c5a2ba7
TH
2751 goto reflush;
2752 }
2753
76af4d93 2754 spin_lock(&workqueue_lock);
9c5a2ba7
TH
2755 if (!--wq->nr_drainers)
2756 wq->flags &= ~WQ_DRAINING;
76af4d93
TH
2757 spin_unlock(&workqueue_lock);
2758
2759 local_irq_enable();
9c5a2ba7
TH
2760}
2761EXPORT_SYMBOL_GPL(drain_workqueue);
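/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * drain_workqueue() suits teardown paths whose work items may re-queue
 * themselves; a plain flush_workqueue() would not catch the re-queued
 * instances.  gc_wq, gc_fn and more_objects_to_scan() are hypothetical.
 */
static struct workqueue_struct *gc_wq;

static bool more_objects_to_scan(void)
{
        return false;   /* hypothetical stand-in predicate */
}

static void gc_fn(struct work_struct *work)
{
        if (more_objects_to_scan())
                queue_work(gc_wq, work);        /* chain queueing, allowed while draining */
}

static void gc_teardown(void)
{
        drain_workqueue(gc_wq);         /* returns only once gc_wq is empty */
        destroy_workqueue(gc_wq);
}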
2762
606a5020 2763static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
db700897 2764{
affee4b2 2765 struct worker *worker = NULL;
c9e7cf27 2766 struct worker_pool *pool;
112202d9 2767 struct pool_workqueue *pwq;
db700897
ON
2768
2769 might_sleep();
c9e7cf27
TH
2770 pool = get_work_pool(work);
2771 if (!pool)
baf59022 2772 return false;
db700897 2773
d565ed63 2774 spin_lock_irq(&pool->lock);
0b3dae68 2775 /* see the comment in try_to_grab_pending() with the same code */
112202d9
TH
2776 pwq = get_work_pwq(work);
2777 if (pwq) {
2778 if (unlikely(pwq->pool != pool))
4690c4ab 2779 goto already_gone;
606a5020 2780 } else {
c9e7cf27 2781 worker = find_worker_executing_work(pool, work);
affee4b2 2782 if (!worker)
4690c4ab 2783 goto already_gone;
112202d9 2784 pwq = worker->current_pwq;
606a5020 2785 }
db700897 2786
112202d9 2787 insert_wq_barrier(pwq, barr, work, worker);
d565ed63 2788 spin_unlock_irq(&pool->lock);
7a22ad75 2789
e159489b
TH
2790 /*
2791 * If @max_active is 1 or rescuer is in use, flushing another work
2792 * item on the same workqueue may lead to deadlock. Make sure the
2793 * flusher is not running on the same workqueue by verifying write
2794 * access.
2795 */
112202d9
TH
2796 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2797 lock_map_acquire(&pwq->wq->lockdep_map);
e159489b 2798 else
112202d9
TH
2799 lock_map_acquire_read(&pwq->wq->lockdep_map);
2800 lock_map_release(&pwq->wq->lockdep_map);
e159489b 2801
401a8d04 2802 return true;
4690c4ab 2803already_gone:
d565ed63 2804 spin_unlock_irq(&pool->lock);
401a8d04 2805 return false;
db700897 2806}
baf59022
TH
2807
2808/**
2809 * flush_work - wait for a work to finish executing the last queueing instance
2810 * @work: the work to flush
2811 *
606a5020
TH
2812 * Wait until @work has finished execution. @work is guaranteed to be idle
2813 * on return if it hasn't been requeued since flush started.
baf59022
TH
2814 *
2815 * RETURNS:
2816 * %true if flush_work() waited for the work to finish execution,
2817 * %false if it was already idle.
2818 */
2819bool flush_work(struct work_struct *work)
2820{
2821 struct wq_barrier barr;
2822
0976dfc1
SB
2823 lock_map_acquire(&work->lockdep_map);
2824 lock_map_release(&work->lockdep_map);
2825
606a5020 2826 if (start_flush_work(work, &barr)) {
401a8d04
TH
2827 wait_for_completion(&barr.done);
2828 destroy_work_on_stack(&barr.work);
2829 return true;
606a5020 2830 } else {
401a8d04 2831 return false;
6e84d644 2832 }
6e84d644 2833}
606a5020 2834EXPORT_SYMBOL_GPL(flush_work);
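/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * flush_work() waits for the last queueing instance of one specific work
 * item.  update_stats(), stats_work and read_stats() are hypothetical.
 */
static void update_stats(struct work_struct *work)
{
        /* recompute some cached statistics */
}

static DECLARE_WORK(stats_work, update_stats);

static void read_stats(void)
{
        schedule_work(&stats_work);
        /* wait until this queueing instance has finished executing */
        flush_work(&stats_work);
}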
6e84d644 2835
36e227d2 2836static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
1f1f642e 2837{
bbb68dfa 2838 unsigned long flags;
1f1f642e
ON
2839 int ret;
2840
2841 do {
bbb68dfa
TH
2842 ret = try_to_grab_pending(work, is_dwork, &flags);
2843 /*
2844 * If someone else is canceling, wait for the same event it
2845 * would be waiting for before retrying.
2846 */
2847 if (unlikely(ret == -ENOENT))
606a5020 2848 flush_work(work);
1f1f642e
ON
2849 } while (unlikely(ret < 0));
2850
bbb68dfa
TH
2851 /* tell other tasks trying to grab @work to back off */
2852 mark_work_canceling(work);
2853 local_irq_restore(flags);
2854
606a5020 2855 flush_work(work);
7a22ad75 2856 clear_work_data(work);
1f1f642e
ON
2857 return ret;
2858}
2859
6e84d644 2860/**
401a8d04
TH
2861 * cancel_work_sync - cancel a work and wait for it to finish
2862 * @work: the work to cancel
6e84d644 2863 *
401a8d04
TH
2864 * Cancel @work and wait for its execution to finish. This function
2865 * can be used even if the work re-queues itself or migrates to
2866 * another workqueue. On return from this function, @work is
2867 * guaranteed to be not pending or executing on any CPU.
1f1f642e 2868 *
401a8d04
TH
2869 * cancel_work_sync(&delayed_work->work) must not be used for
2870 * delayed_work's. Use cancel_delayed_work_sync() instead.
6e84d644 2871 *
401a8d04 2872 * The caller must ensure that the workqueue on which @work was last
6e84d644 2873 * queued can't be destroyed before this function returns.
401a8d04
TH
2874 *
2875 * RETURNS:
2876 * %true if @work was pending, %false otherwise.
6e84d644 2877 */
401a8d04 2878bool cancel_work_sync(struct work_struct *work)
6e84d644 2879{
36e227d2 2880 return __cancel_work_timer(work, false);
b89deed3 2881}
28e53bdd 2882EXPORT_SYMBOL_GPL(cancel_work_sync);
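/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * cancel_work_sync() in an exit path.  After the call the work item is
 * neither pending nor running on any CPU, even if it was queued from an
 * interrupt handler.  irq_bottom_half(), bh_work and my_module_exit() are
 * hypothetical names.
 */
static void irq_bottom_half(struct work_struct *work)
{
        /* deferred interrupt processing */
}

static DECLARE_WORK(bh_work, irq_bottom_half);

static void my_module_exit(void)
{
        /* cancels a pending instance and waits out a running one */
        cancel_work_sync(&bh_work);
}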
b89deed3 2883
6e84d644 2884/**
401a8d04
TH
2885 * flush_delayed_work - wait for a dwork to finish executing the last queueing
2886 * @dwork: the delayed work to flush
6e84d644 2887 *
401a8d04
TH
2888 * Delayed timer is cancelled and the pending work is queued for
2889 * immediate execution. Like flush_work(), this function only
2890 * considers the last queueing instance of @dwork.
1f1f642e 2891 *
401a8d04
TH
2892 * RETURNS:
2893 * %true if flush_work() waited for the work to finish execution,
2894 * %false if it was already idle.
6e84d644 2895 */
401a8d04
TH
2896bool flush_delayed_work(struct delayed_work *dwork)
2897{
8930caba 2898 local_irq_disable();
401a8d04 2899 if (del_timer_sync(&dwork->timer))
60c057bc 2900 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
8930caba 2901 local_irq_enable();
401a8d04
TH
2902 return flush_work(&dwork->work);
2903}
2904EXPORT_SYMBOL(flush_delayed_work);
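/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * flush_delayed_work() pulls a timer-pending dwork forward and waits for
 * it.  write_back(), wb_dwork and sync_now() are hypothetical names.
 */
static void write_back(struct work_struct *work)
{
        /* push dirty state out to the device */
}

static DECLARE_DELAYED_WORK(wb_dwork, write_back);

static void sync_now(void)
{
        schedule_delayed_work(&wb_dwork, HZ);   /* would normally run a second later */
        /* queue it for immediate execution and wait for it */
        flush_delayed_work(&wb_dwork);
}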
2905
09383498 2906/**
57b30ae7
TH
2907 * cancel_delayed_work - cancel a delayed work
2908 * @dwork: delayed_work to cancel
09383498 2909 *
57b30ae7
TH
2910 * Kill off a pending delayed_work. Returns %true if @dwork was pending
2911 * and canceled; %false if it wasn't pending. Note that the work callback
2912 * function may still be running on return, unless it returns %true and the
2913 * work doesn't re-arm itself. Explicitly flush or use
2914 * cancel_delayed_work_sync() to wait on it.
09383498 2915 *
57b30ae7 2916 * This function is safe to call from any context including IRQ handler.
09383498 2917 */
57b30ae7 2918bool cancel_delayed_work(struct delayed_work *dwork)
09383498 2919{
57b30ae7
TH
2920 unsigned long flags;
2921 int ret;
2922
2923 do {
2924 ret = try_to_grab_pending(&dwork->work, true, &flags);
2925 } while (unlikely(ret == -EAGAIN));
2926
2927 if (unlikely(ret < 0))
2928 return false;
2929
7c3eed5c
TH
2930 set_work_pool_and_clear_pending(&dwork->work,
2931 get_work_pool_id(&dwork->work));
57b30ae7 2932 local_irq_restore(flags);
c0158ca6 2933 return ret;
09383498 2934}
57b30ae7 2935EXPORT_SYMBOL(cancel_delayed_work);
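/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>
 * and <linux/interrupt.h>): cancel_delayed_work() is safe from IRQ context,
 * but the callback may still be running when it returns.  poll_hw(),
 * poll_dwork and my_irq_handler() are hypothetical names.
 */
static void poll_hw(struct work_struct *work)
{
        /* periodic hardware poll */
}

static DECLARE_DELAYED_WORK(poll_dwork, poll_hw);

static irqreturn_t my_irq_handler(int irq, void *data)
{
        /* the device signalled completion; stop the pending poll */
        cancel_delayed_work(&poll_dwork);
        return IRQ_HANDLED;
}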
09383498 2936
401a8d04
TH
2937/**
2938 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
2939 * @dwork: the delayed work to cancel
2940 *
2941 * This is cancel_work_sync() for delayed works.
2942 *
2943 * RETURNS:
2944 * %true if @dwork was pending, %false otherwise.
2945 */
2946bool cancel_delayed_work_sync(struct delayed_work *dwork)
6e84d644 2947{
36e227d2 2948 return __cancel_work_timer(&dwork->work, true);
6e84d644 2949}
f5a421a4 2950EXPORT_SYMBOL(cancel_delayed_work_sync);
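/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * the sync variant for teardown, in contrast to cancel_delayed_work()
 * above which does not wait for a running callback.  refresh_cache(),
 * refresh_dwork and cache_destroy() are hypothetical names.
 */
static void refresh_cache(struct work_struct *work)
{
        /* periodic cache refresh */
}

static DECLARE_DELAYED_WORK(refresh_dwork, refresh_cache);

static void cache_destroy(void)
{
        /* afterwards refresh_cache() is neither pending nor running */
        cancel_delayed_work_sync(&refresh_dwork);
}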
1da177e4 2951
0fcb78c2 2952/**
c1a220e7
ZR
2953 * schedule_work_on - put work task on a specific cpu
2954 * @cpu: cpu to put the work task on
2955 * @work: job to be done
2956 *
2957 * This puts a job on a specific cpu
2958 */
d4283e93 2959bool schedule_work_on(int cpu, struct work_struct *work)
c1a220e7 2960{
d320c038 2961 return queue_work_on(cpu, system_wq, work);
c1a220e7
ZR
2962}
2963EXPORT_SYMBOL(schedule_work_on);
2964
0fcb78c2 2965/**
0fcb78c2
REB
2966 * schedule_work - put work task in global workqueue
2967 * @work: job to be done
0fcb78c2 2968 *
d4283e93
TH
2969 * Returns %false if @work was already on the kernel-global workqueue and
2970 * %true otherwise.
5b0f437d
BVA
2971 *
2972 * This puts a job in the kernel-global workqueue if it was not already
2973 * queued and leaves it in the same position on the kernel-global
2974 * workqueue otherwise.
0fcb78c2 2975 */
d4283e93 2976bool schedule_work(struct work_struct *work)
1da177e4 2977{
d320c038 2978 return queue_work(system_wq, work);
1da177e4 2979}
ae90dd5d 2980EXPORT_SYMBOL(schedule_work);
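/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>
 * and <linux/printk.h>): schedule_work() defers a job to the kernel-global
 * workqueue (system_wq); schedule_work_on() does the same on one chosen
 * CPU.  log_event(), log_work and note_event() are hypothetical names.
 */
static void log_event(struct work_struct *work)
{
        /* runs later in process context */
}

static DECLARE_WORK(log_work, log_event);

static void note_event(void)
{
        /* %false means log_work was already pending and stays in place */
        if (!schedule_work(&log_work))
                pr_debug("log_work already queued\n");
}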
1da177e4 2981
0fcb78c2
REB
2982/**
2983 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2984 * @cpu: cpu to use
52bad64d 2985 * @dwork: job to be done
0fcb78c2
REB
2986 * @delay: number of jiffies to wait
2987 *
2988 * After waiting for a given time this puts a job in the kernel-global
2989 * workqueue on the specified CPU.
2990 */
d4283e93
TH
2991bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2992 unsigned long delay)
1da177e4 2993{
d320c038 2994 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
1da177e4 2995}
ae90dd5d 2996EXPORT_SYMBOL(schedule_delayed_work_on);
1da177e4 2997
0fcb78c2
REB
2998/**
2999 * schedule_delayed_work - put work task in global workqueue after delay
52bad64d
DH
3000 * @dwork: job to be done
3001 * @delay: number of jiffies to wait or 0 for immediate execution
0fcb78c2
REB
3002 *
3003 * After waiting for a given time this puts a job in the kernel-global
3004 * workqueue.
3005 */
d4283e93 3006bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
1da177e4 3007{
d320c038 3008 return queue_delayed_work(system_wq, dwork, delay);
1da177e4 3009}
ae90dd5d 3010EXPORT_SYMBOL(schedule_delayed_work);
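/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * a self-re-arming poll on the kernel-global workqueue; the _on() variant
 * would pin it to a chosen CPU instead.  POLL_PERIOD, poll_status(),
 * status_dwork and start_polling() are hypothetical names.
 */
#define POLL_PERIOD     (5 * HZ)

static void poll_status(struct work_struct *work)
{
        /* check device status ... then re-arm for the next period */
        schedule_delayed_work(to_delayed_work(work), POLL_PERIOD);
}

static DECLARE_DELAYED_WORK(status_dwork, poll_status);

static void start_polling(void)
{
        schedule_delayed_work(&status_dwork, POLL_PERIOD);
}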
1da177e4 3011
b6136773 3012/**
31ddd871 3013 * schedule_on_each_cpu - execute a function synchronously on each online CPU
b6136773 3014 * @func: the function to call
b6136773 3015 *
31ddd871
TH
3016 * schedule_on_each_cpu() executes @func on each online CPU using the
3017 * system workqueue and blocks until all CPUs have completed.
b6136773 3018 * schedule_on_each_cpu() is very slow.
31ddd871
TH
3019 *
3020 * RETURNS:
3021 * 0 on success, -errno on failure.
b6136773 3022 */
65f27f38 3023int schedule_on_each_cpu(work_func_t func)
15316ba8
CL
3024{
3025 int cpu;
38f51568 3026 struct work_struct __percpu *works;
15316ba8 3027
b6136773
AM
3028 works = alloc_percpu(struct work_struct);
3029 if (!works)
15316ba8 3030 return -ENOMEM;
b6136773 3031
93981800
TH
3032 get_online_cpus();
3033
15316ba8 3034 for_each_online_cpu(cpu) {
9bfb1839
IM
3035 struct work_struct *work = per_cpu_ptr(works, cpu);
3036
3037 INIT_WORK(work, func);
b71ab8c2 3038 schedule_work_on(cpu, work);
65a64464 3039 }
93981800
TH
3040
3041 for_each_online_cpu(cpu)
3042 flush_work(per_cpu_ptr(works, cpu));
3043
95402b38 3044 put_online_cpus();
b6136773 3045 free_percpu(works);
15316ba8
CL
3046 return 0;
3047}
3048
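/*
 * Illustrative sketch (not part of this file, assumes <linux/workqueue.h>):
 * schedule_on_each_cpu() runs a function once on every online CPU and
 * waits for all of them, e.g. to fold per-CPU counters.
 * fold_local_counter() and fold_all_counters() are hypothetical names.
 */
static void fold_local_counter(struct work_struct *work)
{
        /* executes in the worker of the CPU it was queued on */
}

static int fold_all_counters(void)
{
        /* slow path: allocates one work item per CPU and flushes them all */
        return schedule_on_each_cpu(fold_local_counter);
}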
eef6a7d5
AS
3049/**
3050 * flush_scheduled_work - ensure that any scheduled work has run to completion.
3051 *
3052 * Forces execution of the kernel-global workqueue and blocks until its
3053 * completion.
3054 *
3055 * Think twice before calling this function! It's very easy to get into
3056 * trouble if you don't take great care. Either of the following situations
3057 * will lead to deadlock:
3058 *
3059 * One of the work items currently on the workqueue needs to acquire
3060 * a lock held by your code or its caller.
3061 *
3062 * Your code is running in the context of a work routine.
3063 *
3064 * They will be detected by lockdep when they occur, but the first might not
3065 * occur very often. It depends on what work items are on the workqueue and
3066 * what locks they need, which you have no control over.
3067 *
3068 * In most situations flushing the entire workqueue is overkill; you merely
3069 * need to know that a particular work item isn't queued and isn't running.
3070 * In such cases you should use cancel_delayed_work_sync() or
3071 * cancel_work_sync() instead.
3072 */
1da177e4
LT
3073void flush_scheduled_work(void)
3074{
d320c038 3075 flush_workqueue(system_wq);
1da177e4 3076}
ae90dd5d 3077EXPORT_SYMBOL(flush_scheduled_work);
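/*
 * Usage sketch (illustrative only): instead of flushing the whole
 * kernel-global workqueue, wait for or cancel just your own item, as the
 * comment above recommends.  example_work is hypothetical.
 */
static void example_work_fn(struct work_struct *work)
{
	pr_info("example work\n");
}

static DECLARE_WORK(example_work, example_work_fn);

static void __maybe_unused example_teardown(void)
{
	/* guarantees example_work_fn() is neither queued nor running */
	cancel_work_sync(&example_work);
}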
1da177e4 3078
1fa44eca
JB
3079/**
3080 * execute_in_process_context - reliably execute the routine with user context
3081 * @fn: the function to execute
1fa44eca
JB
3082 * @ew: guaranteed storage for the execute work structure (must
3083 * be available when the work executes)
3084 *
3085 * Executes the function immediately if process context is available,
3086 * otherwise schedules the function for deferred execution on the
3086 * kernel-global workqueue.
3087 *
3088 * Returns: 0 - function was executed
3089 * 1 - function was scheduled for execution
3090 */
65f27f38 3091int execute_in_process_context(work_func_t fn, struct execute_work *ew)
1fa44eca
JB
3092{
3093 if (!in_interrupt()) {
65f27f38 3094 fn(&ew->work);
1fa44eca
JB
3095 return 0;
3096 }
3097
65f27f38 3098 INIT_WORK(&ew->work, fn);
1fa44eca
JB
3099 schedule_work(&ew->work);
3100
3101 return 1;
3102}
3103EXPORT_SYMBOL_GPL(execute_in_process_context);
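/*
 * Usage sketch (illustrative only): release a resource that needs process
 * context from a path that may be called in interrupt context.  The
 * example_resource structure and helpers are hypothetical.
 */
struct example_resource {
	struct execute_work ew;		/* storage for the deferred case */
	void *data;
};

static void example_release(struct work_struct *work)
{
	struct example_resource *res =
		container_of(work, struct example_resource, ew.work);

	kfree(res->data);
	kfree(res);
}

static void __maybe_unused example_put_resource(struct example_resource *res)
{
	/* runs example_release() right away, or defers it if in_interrupt() */
	execute_in_process_context(example_release, &res->ew);
}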
3104
1da177e4
LT
3105int keventd_up(void)
3106{
d320c038 3107 return system_wq != NULL;
1da177e4
LT
3108}
3109
30cdf249 3110static int alloc_and_link_pwqs(struct workqueue_struct *wq)
0f900049 3111{
49e3cf44 3112 bool highpri = wq->flags & WQ_HIGHPRI;
30cdf249
TH
3113 int cpu;
3114
3115 if (!(wq->flags & WQ_UNBOUND)) {
420c0ddb
TH
3116 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
3117 if (!wq->cpu_pwqs)
30cdf249
TH
3118 return -ENOMEM;
3119
3120 for_each_possible_cpu(cpu) {
7fb98ea7
TH
3121 struct pool_workqueue *pwq =
3122 per_cpu_ptr(wq->cpu_pwqs, cpu);
f3421797 3123
49e3cf44 3124 pwq->pool = get_std_worker_pool(cpu, highpri);
76af4d93 3125 list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
30cdf249
TH
3126 }
3127 } else {
3128 struct pool_workqueue *pwq;
3129
3130 pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL);
3131 if (!pwq)
3132 return -ENOMEM;
3133
49e3cf44 3134 pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri);
76af4d93 3135 list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs);
30cdf249
TH
3136 }
3137
3138 return 0;
0f900049
TH
3139}
3140
112202d9 3141static void free_pwqs(struct workqueue_struct *wq)
0f900049 3142{
e06ffa1e 3143 if (!(wq->flags & WQ_UNBOUND))
420c0ddb
TH
3144 free_percpu(wq->cpu_pwqs);
3145 else if (!list_empty(&wq->pwqs))
3146 kmem_cache_free(pwq_cache, list_first_entry(&wq->pwqs,
3147 struct pool_workqueue, pwqs_node));
0f900049
TH
3148}
3149
f3421797
TH
3150static int wq_clamp_max_active(int max_active, unsigned int flags,
3151 const char *name)
b71ab8c2 3152{
f3421797
TH
3153 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
3154
3155 if (max_active < 1 || max_active > lim)
044c782c
VI
3156 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
3157 max_active, name, 1, lim);
b71ab8c2 3158
f3421797 3159 return clamp_val(max_active, 1, lim);
b71ab8c2
TH
3160}
3161
b196be89 3162struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
d320c038
TH
3163 unsigned int flags,
3164 int max_active,
3165 struct lock_class_key *key,
b196be89 3166 const char *lock_name, ...)
1da177e4 3167{
b196be89 3168 va_list args, args1;
1da177e4 3169 struct workqueue_struct *wq;
49e3cf44 3170 struct pool_workqueue *pwq;
b196be89
TH
3171 size_t namelen;
3172
3173 /* determine namelen, allocate wq and format name */
3174 va_start(args, lock_name);
3175 va_copy(args1, args);
3176 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3177
3178 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
3179 if (!wq)
3180 goto err;
3181
3182 vsnprintf(wq->name, namelen, fmt, args1);
3183 va_end(args);
3184 va_end(args1);
1da177e4 3185
6370a6ad
TH
3186 /*
3187 * Workqueues which may be used during memory reclaim should
3188 * have a rescuer to guarantee forward progress.
3189 */
3190 if (flags & WQ_MEM_RECLAIM)
3191 flags |= WQ_RESCUER;
3192
d320c038 3193 max_active = max_active ?: WQ_DFL_ACTIVE;
b196be89 3194 max_active = wq_clamp_max_active(max_active, flags, wq->name);
3af24433 3195
b196be89 3196 /* init wq */
97e37d7b 3197 wq->flags = flags;
a0a1a5fd 3198 wq->saved_max_active = max_active;
73f53c4a 3199 mutex_init(&wq->flush_mutex);
112202d9 3200 atomic_set(&wq->nr_pwqs_to_flush, 0);
30cdf249 3201 INIT_LIST_HEAD(&wq->pwqs);
73f53c4a
TH
3202 INIT_LIST_HEAD(&wq->flusher_queue);
3203 INIT_LIST_HEAD(&wq->flusher_overflow);
493a1724 3204 INIT_LIST_HEAD(&wq->maydays);
502ca9d8 3205
eb13ba87 3206 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
cce1a165 3207 INIT_LIST_HEAD(&wq->list);
3af24433 3208
30cdf249 3209 if (alloc_and_link_pwqs(wq) < 0)
bdbc5dd7
TH
3210 goto err;
3211
76af4d93 3212 local_irq_disable();
49e3cf44 3213 for_each_pwq(pwq, wq) {
112202d9 3214 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
112202d9
TH
3215 pwq->wq = wq;
3216 pwq->flush_color = -1;
3217 pwq->max_active = max_active;
3218 INIT_LIST_HEAD(&pwq->delayed_works);
493a1724 3219 INIT_LIST_HEAD(&pwq->mayday_node);
e22bee78 3220 }
76af4d93 3221 local_irq_enable();
1537663f 3222
e22bee78
TH
3223 if (flags & WQ_RESCUER) {
3224 struct worker *rescuer;
3225
e22bee78
TH
3226 wq->rescuer = rescuer = alloc_worker();
3227 if (!rescuer)
3228 goto err;
3229
111c225a
TH
3230 rescuer->rescue_wq = wq;
3231 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
b196be89 3232 wq->name);
e22bee78
TH
3233 if (IS_ERR(rescuer->task))
3234 goto err;
3235
e22bee78
TH
3236 rescuer->task->flags |= PF_THREAD_BOUND;
3237 wake_up_process(rescuer->task);
3af24433
ON
3238 }
3239
a0a1a5fd
TH
3240 /*
3241 * workqueue_lock protects global freeze state and workqueues
3242 * list. Grab it, set max_active accordingly and add the new
3243 * workqueue to the workqueues list.
3244 */
e98d5b16 3245 spin_lock_irq(&workqueue_lock);
a0a1a5fd 3246
58a69cb4 3247 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
49e3cf44
TH
3248 for_each_pwq(pwq, wq)
3249 pwq->max_active = 0;
a0a1a5fd 3250
1537663f 3251 list_add(&wq->list, &workqueues);
a0a1a5fd 3252
e98d5b16 3253 spin_unlock_irq(&workqueue_lock);
1537663f 3254
3af24433 3255 return wq;
4690c4ab
TH
3256err:
3257 if (wq) {
112202d9 3258 free_pwqs(wq);
e22bee78 3259 kfree(wq->rescuer);
4690c4ab
TH
3260 kfree(wq);
3261 }
3262 return NULL;
3af24433 3263}
d320c038 3264EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
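/*
 * Usage sketch (illustrative only): callers normally go through the
 * alloc_workqueue() wrapper rather than calling __alloc_workqueue_key()
 * directly.  example_wq and example_init() are hypothetical.
 */
static struct workqueue_struct *example_wq;

static int __maybe_unused example_init(void)
{
	/*
	 * A freezable workqueue backed by a rescuer (WQ_MEM_RECLAIM), with
	 * at most 16 work items in flight per CPU.
	 */
	example_wq = alloc_workqueue("example_io",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 16);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}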
1da177e4 3265
3af24433
ON
3266/**
3267 * destroy_workqueue - safely terminate a workqueue
3268 * @wq: target workqueue
3269 *
3270 * Safely destroy a workqueue. All work currently pending will be done first.
3271 */
3272void destroy_workqueue(struct workqueue_struct *wq)
3273{
49e3cf44 3274 struct pool_workqueue *pwq;
3af24433 3275
9c5a2ba7
TH
3276 /* drain it before proceeding with destruction */
3277 drain_workqueue(wq);
c8efcc25 3278
76af4d93
TH
3279 spin_lock_irq(&workqueue_lock);
3280
6183c009 3281 /* sanity checks */
49e3cf44 3282 for_each_pwq(pwq, wq) {
6183c009
TH
3283 int i;
3284
76af4d93
TH
3285 for (i = 0; i < WORK_NR_COLORS; i++) {
3286 if (WARN_ON(pwq->nr_in_flight[i])) {
3287 spin_unlock_irq(&workqueue_lock);
6183c009 3288 return;
76af4d93
TH
3289 }
3290 }
3291
6183c009 3292 if (WARN_ON(pwq->nr_active) ||
76af4d93
TH
3293 WARN_ON(!list_empty(&pwq->delayed_works))) {
3294 spin_unlock_irq(&workqueue_lock);
6183c009 3295 return;
76af4d93 3296 }
6183c009
TH
3297 }
3298
a0a1a5fd
TH
3299 /*
3300 * The wq list is used by the freezer; remove @wq from the list only
3301 * after draining is complete in case a freeze races us.
3302 */
b1f4ec17 3303 list_del(&wq->list);
76af4d93 3304
e98d5b16 3305 spin_unlock_irq(&workqueue_lock);
3af24433 3306
e22bee78
TH
3307 if (wq->flags & WQ_RESCUER) {
3308 kthread_stop(wq->rescuer->task);
8d9df9f0 3309 kfree(wq->rescuer);
e22bee78
TH
3310 }
3311
112202d9 3312 free_pwqs(wq);
3af24433
ON
3313 kfree(wq);
3314}
3315EXPORT_SYMBOL_GPL(destroy_workqueue);
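/*
 * Usage sketch (illustrative only): typical teardown order for a private
 * workqueue.  example_wq and example_dwork are hypothetical and assumed to
 * have been set up elsewhere.
 */
static void __maybe_unused example_exit(struct workqueue_struct *example_wq,
					struct delayed_work *example_dwork)
{
	/* stop re-arming first, then let destroy_workqueue() drain the rest */
	cancel_delayed_work_sync(example_dwork);
	destroy_workqueue(example_wq);
}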
3316
9f4bd4cd 3317/**
112202d9
TH
3318 * pwq_set_max_active - adjust max_active of a pwq
3319 * @pwq: target pool_workqueue
9f4bd4cd
LJ
3320 * @max_active: new max_active value.
3321 *
112202d9 3322 * Set @pwq->max_active to @max_active and activate delayed works if
9f4bd4cd
LJ
3323 * increased.
3324 *
3325 * CONTEXT:
d565ed63 3326 * spin_lock_irq(pool->lock).
9f4bd4cd 3327 */
112202d9 3328static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
9f4bd4cd 3329{
112202d9 3330 pwq->max_active = max_active;
9f4bd4cd 3331
112202d9
TH
3332 while (!list_empty(&pwq->delayed_works) &&
3333 pwq->nr_active < pwq->max_active)
3334 pwq_activate_first_delayed(pwq);
9f4bd4cd
LJ
3335}
3336
dcd989cb
TH
3337/**
3338 * workqueue_set_max_active - adjust max_active of a workqueue
3339 * @wq: target workqueue
3340 * @max_active: new max_active value.
3341 *
3342 * Set max_active of @wq to @max_active.
3343 *
3344 * CONTEXT:
3345 * Don't call from IRQ context.
3346 */
3347void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3348{
49e3cf44 3349 struct pool_workqueue *pwq;
dcd989cb 3350
f3421797 3351 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
dcd989cb 3352
e98d5b16 3353 spin_lock_irq(&workqueue_lock);
dcd989cb
TH
3354
3355 wq->saved_max_active = max_active;
3356
49e3cf44 3357 for_each_pwq(pwq, wq) {
112202d9 3358 struct worker_pool *pool = pwq->pool;
dcd989cb 3359
e98d5b16 3360 spin_lock(&pool->lock);
dcd989cb 3361
58a69cb4 3362 if (!(wq->flags & WQ_FREEZABLE) ||
35b6bb63 3363 !(pool->flags & POOL_FREEZING))
112202d9 3364 pwq_set_max_active(pwq, max_active);
9bfb1839 3365
e98d5b16 3366 spin_unlock(&pool->lock);
65a64464 3367 }
93981800 3368
e98d5b16 3369 spin_unlock_irq(&workqueue_lock);
15316ba8 3370}
dcd989cb 3371EXPORT_SYMBOL_GPL(workqueue_set_max_active);
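/*
 * Usage sketch (illustrative only): throttle and later restore a private
 * workqueue's concurrency at runtime.  example_wq is hypothetical.
 */
static void __maybe_unused example_throttle(struct workqueue_struct *example_wq,
					    bool slow)
{
	/* allow only one in-flight work item while throttled */
	workqueue_set_max_active(example_wq, slow ? 1 : WQ_DFL_ACTIVE);
}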
15316ba8 3372
eef6a7d5 3373/**
dcd989cb
TH
3374 * workqueue_congested - test whether a workqueue is congested
3375 * @cpu: CPU in question
3376 * @wq: target workqueue
eef6a7d5 3377 *
dcd989cb
TH
3378 * Test whether @wq's cpu workqueue for @cpu is congested. There is
3379 * no synchronization around this function and the test result is
3380 * unreliable and only useful as advisory hints or for debugging.
eef6a7d5 3381 *
dcd989cb
TH
3382 * RETURNS:
3383 * %true if congested, %false otherwise.
eef6a7d5 3384 */
d84ff051 3385bool workqueue_congested(int cpu, struct workqueue_struct *wq)
1da177e4 3386{
7fb98ea7 3387 struct pool_workqueue *pwq;
76af4d93
TH
3388 bool ret;
3389
3390 preempt_disable();
7fb98ea7
TH
3391
3392 if (!(wq->flags & WQ_UNBOUND))
3393 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
3394 else
3395 pwq = first_pwq(wq);
dcd989cb 3396
76af4d93
TH
3397 ret = !list_empty(&pwq->delayed_works);
3398 preempt_enable();
3399
3400 return ret;
1da177e4 3401}
dcd989cb 3402EXPORT_SYMBOL_GPL(workqueue_congested);
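/*
 * Usage sketch (illustrative only): use the congestion hint to shed
 * optional work.  example_wq and example_stat_work are hypothetical.
 */
static void __maybe_unused example_queue_stats(struct workqueue_struct *example_wq,
					       struct work_struct *example_stat_work)
{
	/* purely advisory - the answer may be stale by the time we act on it */
	if (!workqueue_congested(raw_smp_processor_id(), example_wq))
		queue_work(example_wq, example_stat_work);
}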
1da177e4 3403
dcd989cb
TH
3404/**
3405 * work_busy - test whether a work is currently pending or running
3406 * @work: the work to be tested
3407 *
3408 * Test whether @work is currently pending or running. There is no
3409 * synchronization around this function and the test result is
3410 * unreliable and only useful as advisory hints or for debugging.
dcd989cb
TH
3411 *
3412 * RETURNS:
3413 * OR'd bitmask of WORK_BUSY_* bits.
3414 */
3415unsigned int work_busy(struct work_struct *work)
1da177e4 3416{
c9e7cf27 3417 struct worker_pool *pool = get_work_pool(work);
dcd989cb
TH
3418 unsigned long flags;
3419 unsigned int ret = 0;
1da177e4 3420
dcd989cb
TH
3421 if (work_pending(work))
3422 ret |= WORK_BUSY_PENDING;
1da177e4 3423
038366c5
LJ
3424 if (pool) {
3425 spin_lock_irqsave(&pool->lock, flags);
3426 if (find_worker_executing_work(pool, work))
3427 ret |= WORK_BUSY_RUNNING;
3428 spin_unlock_irqrestore(&pool->lock, flags);
3429 }
1da177e4 3430
dcd989cb 3431 return ret;
1da177e4 3432}
dcd989cb 3433EXPORT_SYMBOL_GPL(work_busy);
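/*
 * Usage sketch (illustrative only): report a work item's state for
 * debugging.  example_work is hypothetical.
 */
static void __maybe_unused example_report(struct work_struct *example_work)
{
	unsigned int busy = work_busy(example_work);

	pr_info("work %p:%s%s\n", example_work,
		busy & WORK_BUSY_PENDING ? " pending" : "",
		busy & WORK_BUSY_RUNNING ? " running" : "");
}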
1da177e4 3434
db7bccf4
TH
3435/*
3436 * CPU hotplug.
3437 *
e22bee78 3438 * There are two challenges in supporting CPU hotplug. Firstly, there
112202d9 3439 * are a lot of assumptions on strong associations among work, pwq and
706026c2 3440 * pool which make migrating pending and scheduled works very
e22bee78 3441 * difficult to implement without impacting hot paths. Secondly,
94cf58bb 3442 * worker pools serve a mix of short, long and very long running works, making
e22bee78
TH
3443 * blocked draining impractical.
3444 *
24647570 3445 * This is solved by allowing a pool to be disassociated from its CPU,
628c78e7
TH
3446 * run as an unbound one, and be reattached later if the CPU comes
3447 * back online.
db7bccf4 3448 */
1da177e4 3449
706026c2 3450static void wq_unbind_fn(struct work_struct *work)
3af24433 3451{
38db41d9 3452 int cpu = smp_processor_id();
4ce62e9e 3453 struct worker_pool *pool;
db7bccf4 3454 struct worker *worker;
db7bccf4 3455 int i;
3af24433 3456
38db41d9 3457 for_each_std_worker_pool(pool, cpu) {
6183c009 3458 WARN_ON_ONCE(cpu != smp_processor_id());
db7bccf4 3459
94cf58bb
TH
3460 mutex_lock(&pool->assoc_mutex);
3461 spin_lock_irq(&pool->lock);
3af24433 3462
94cf58bb
TH
3463 /*
3464 * We've claimed all manager positions. Make all workers
3465 * unbound and set DISASSOCIATED. Before this, all workers
3466 * except for the ones which are still executing works from
3467 * before the last CPU down must be on the cpu. After
3468 * this, they may be running on any CPU.
3469 */
4ce62e9e 3470 list_for_each_entry(worker, &pool->idle_list, entry)
403c821d 3471 worker->flags |= WORKER_UNBOUND;
3af24433 3472
b67bfe0d 3473 for_each_busy_worker(worker, i, pool)
c9e7cf27 3474 worker->flags |= WORKER_UNBOUND;
06ba38a9 3475
24647570 3476 pool->flags |= POOL_DISASSOCIATED;
f2d5a0ee 3477
94cf58bb
TH
3478 spin_unlock_irq(&pool->lock);
3479 mutex_unlock(&pool->assoc_mutex);
3480 }
628c78e7 3481
e22bee78 3482 /*
403c821d 3483 * Call schedule() so that we cross rq->lock and thus can guarantee
628c78e7
TH
3484 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3485 * as scheduler callbacks may be invoked from other cpus.
e22bee78 3486 */
e22bee78 3487 schedule();
06ba38a9 3488
e22bee78 3489 /*
628c78e7
TH
3490 * Sched callbacks are disabled now. Zap nr_running. After this,
3491 * nr_running stays zero and need_more_worker() and keep_working()
38db41d9
TH
3492 * are always true as long as the worklist is not empty. Pools on
3493 * @cpu now behave as unbound (in terms of concurrency management)
3494 * pools which are served by workers tied to the CPU.
628c78e7
TH
3495 *
3496 * On return from this function, the current worker would trigger
3497 * unbound chain execution of pending work items if other workers
3498 * didn't already.
e22bee78 3499 */
38db41d9 3500 for_each_std_worker_pool(pool, cpu)
e19e397a 3501 atomic_set(&pool->nr_running, 0);
3af24433 3502}
3af24433 3503
8db25e78
TH
3504/*
3505 * Workqueues should be brought up before normal priority CPU notifiers.
3506 * This will be registered high priority CPU notifier.
3507 */
9fdf9b73 3508static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
8db25e78
TH
3509 unsigned long action,
3510 void *hcpu)
3af24433 3511{
d84ff051 3512 int cpu = (unsigned long)hcpu;
4ce62e9e 3513 struct worker_pool *pool;
3ce63377 3514
8db25e78 3515 switch (action & ~CPU_TASKS_FROZEN) {
3af24433 3516 case CPU_UP_PREPARE:
38db41d9 3517 for_each_std_worker_pool(pool, cpu) {
3ce63377
TH
3518 struct worker *worker;
3519
3520 if (pool->nr_workers)
3521 continue;
3522
3523 worker = create_worker(pool);
3524 if (!worker)
3525 return NOTIFY_BAD;
3526
d565ed63 3527 spin_lock_irq(&pool->lock);
3ce63377 3528 start_worker(worker);
d565ed63 3529 spin_unlock_irq(&pool->lock);
3af24433 3530 }
8db25e78 3531 break;
3af24433 3532
db7bccf4
TH
3533 case CPU_DOWN_FAILED:
3534 case CPU_ONLINE:
38db41d9 3535 for_each_std_worker_pool(pool, cpu) {
94cf58bb
TH
3536 mutex_lock(&pool->assoc_mutex);
3537 spin_lock_irq(&pool->lock);
3538
24647570 3539 pool->flags &= ~POOL_DISASSOCIATED;
94cf58bb
TH
3540 rebind_workers(pool);
3541
3542 spin_unlock_irq(&pool->lock);
3543 mutex_unlock(&pool->assoc_mutex);
3544 }
db7bccf4 3545 break;
00dfcaf7 3546 }
65758202
TH
3547 return NOTIFY_OK;
3548}
3549
3550/*
3551 * Workqueues should be brought down after normal priority CPU notifiers.
3552 * This will be registered as low priority CPU notifier.
3553 */
9fdf9b73 3554static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
65758202
TH
3555 unsigned long action,
3556 void *hcpu)
3557{
d84ff051 3558 int cpu = (unsigned long)hcpu;
8db25e78
TH
3559 struct work_struct unbind_work;
3560
65758202
TH
3561 switch (action & ~CPU_TASKS_FROZEN) {
3562 case CPU_DOWN_PREPARE:
8db25e78 3563 /* unbinding should happen on the local CPU */
706026c2 3564 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
7635d2fd 3565 queue_work_on(cpu, system_highpri_wq, &unbind_work);
8db25e78
TH
3566 flush_work(&unbind_work);
3567 break;
65758202
TH
3568 }
3569 return NOTIFY_OK;
3570}
3571
2d3854a3 3572#ifdef CONFIG_SMP
8ccad40d 3573
2d3854a3 3574struct work_for_cpu {
ed48ece2 3575 struct work_struct work;
2d3854a3
RR
3576 long (*fn)(void *);
3577 void *arg;
3578 long ret;
3579};
3580
ed48ece2 3581static void work_for_cpu_fn(struct work_struct *work)
2d3854a3 3582{
ed48ece2
TH
3583 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
3584
2d3854a3
RR
3585 wfc->ret = wfc->fn(wfc->arg);
3586}
3587
3588/**
3589 * work_on_cpu - run a function in user context on a particular cpu
3590 * @cpu: the cpu to run on
3591 * @fn: the function to run
3592 * @arg: the function arg
3593 *
31ad9081
RR
3594 * This will return the value @fn returns.
3595 * It is up to the caller to ensure that the cpu doesn't go offline.
6b44003e 3596 * The caller must not hold any locks which would prevent @fn from completing.
2d3854a3 3597 */
d84ff051 3598long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
2d3854a3 3599{
ed48ece2 3600 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
6b44003e 3601
ed48ece2
TH
3602 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
3603 schedule_work_on(cpu, &wfc.work);
3604 flush_work(&wfc.work);
2d3854a3
RR
3605 return wfc.ret;
3606}
3607EXPORT_SYMBOL_GPL(work_on_cpu);
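/*
 * Usage sketch (illustrative only): run a CPU-local query on a specific CPU
 * and collect its return value.  example_read_local_state() is hypothetical;
 * a real caller would typically hold get_online_cpus() across the call to
 * keep the target CPU online.
 */
static long example_read_local_state(void *arg)
{
	/* executes in process context on the chosen CPU */
	return (long)(unsigned long)arg + 1;
}

static long __maybe_unused example_query_cpu(int cpu)
{
	return work_on_cpu(cpu, example_read_local_state, (void *)123UL);
}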
3608#endif /* CONFIG_SMP */
3609
a0a1a5fd
TH
3610#ifdef CONFIG_FREEZER
3611
3612/**
3613 * freeze_workqueues_begin - begin freezing workqueues
3614 *
58a69cb4
TH
3615 * Start freezing workqueues. After this function returns, all freezable
3616 * workqueues will queue new works to their frozen_works list instead of
706026c2 3617 * pool->worklist.
a0a1a5fd
TH
3618 *
3619 * CONTEXT:
d565ed63 3620 * Grabs and releases workqueue_lock and pool->lock's.
a0a1a5fd
TH
3621 */
3622void freeze_workqueues_begin(void)
3623{
17116969 3624 struct worker_pool *pool;
24b8a847
TH
3625 struct workqueue_struct *wq;
3626 struct pool_workqueue *pwq;
17116969 3627 int id;
a0a1a5fd 3628
e98d5b16 3629 spin_lock_irq(&workqueue_lock);
a0a1a5fd 3630
6183c009 3631 WARN_ON_ONCE(workqueue_freezing);
a0a1a5fd
TH
3632 workqueue_freezing = true;
3633
24b8a847 3634 /* set FREEZING */
17116969 3635 for_each_pool(pool, id) {
17116969 3636 spin_lock(&pool->lock);
17116969
TH
3637 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3638 pool->flags |= POOL_FREEZING;
24b8a847
TH
3639 spin_unlock(&pool->lock);
3640 }
a0a1a5fd 3641
24b8a847
TH
3642 /* suppress further executions by setting max_active to zero */
3643 list_for_each_entry(wq, &workqueues, list) {
3644 if (!(wq->flags & WQ_FREEZABLE))
3645 continue;
8b03ae3c 3646
24b8a847
TH
3647 for_each_pwq(pwq, wq) {
3648 spin_lock(&pwq->pool->lock);
3649 pwq->max_active = 0;
3650 spin_unlock(&pwq->pool->lock);
a1056305 3651 }
a0a1a5fd
TH
3652 }
3653
e98d5b16 3654 spin_unlock_irq(&workqueue_lock);
a0a1a5fd
TH
3655}
3656
3657/**
58a69cb4 3658 * freeze_workqueues_busy - are freezable workqueues still busy?
a0a1a5fd
TH
3659 *
3660 * Check whether freezing is complete. This function must be called
3661 * between freeze_workqueues_begin() and thaw_workqueues().
3662 *
3663 * CONTEXT:
3664 * Grabs and releases workqueue_lock.
3665 *
3666 * RETURNS:
58a69cb4
TH
3667 * %true if some freezable workqueues are still busy. %false if freezing
3668 * is complete.
a0a1a5fd
TH
3669 */
3670bool freeze_workqueues_busy(void)
3671{
a0a1a5fd 3672 bool busy = false;
24b8a847
TH
3673 struct workqueue_struct *wq;
3674 struct pool_workqueue *pwq;
a0a1a5fd 3675
e98d5b16 3676 spin_lock_irq(&workqueue_lock);
a0a1a5fd 3677
6183c009 3678 WARN_ON_ONCE(!workqueue_freezing);
a0a1a5fd 3679
24b8a847
TH
3680 list_for_each_entry(wq, &workqueues, list) {
3681 if (!(wq->flags & WQ_FREEZABLE))
3682 continue;
a0a1a5fd
TH
3683 /*
3684 * nr_active is monotonically decreasing. It's safe
3685 * to peek without lock.
3686 */
24b8a847 3687 for_each_pwq(pwq, wq) {
6183c009 3688 WARN_ON_ONCE(pwq->nr_active < 0);
112202d9 3689 if (pwq->nr_active) {
a0a1a5fd
TH
3690 busy = true;
3691 goto out_unlock;
3692 }
3693 }
3694 }
3695out_unlock:
e98d5b16 3696 spin_unlock_irq(&workqueue_lock);
a0a1a5fd
TH
3697 return busy;
3698}
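/*
 * Usage sketch (illustrative only): the freezer side drives these helpers
 * roughly as below - begin freezing, poll until quiescent, and call
 * thaw_workqueues() on resume or failure.  The real caller bounds the wait
 * with a timeout; this sketch does not.
 */
static void __maybe_unused example_freeze_wqs(void)
{
	freeze_workqueues_begin();

	while (freeze_workqueues_busy())
		schedule_timeout_uninterruptible(HZ / 100);
}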
3699
3700/**
3701 * thaw_workqueues - thaw workqueues
3702 *
3703 * Thaw workqueues. Normal queueing is restored and all collected
706026c2 3704 * frozen works are transferred to their respective pool worklists.
a0a1a5fd
TH
3705 *
3706 * CONTEXT:
d565ed63 3707 * Grabs and releases workqueue_lock and pool->lock's.
a0a1a5fd
TH
3708 */
3709void thaw_workqueues(void)
3710{
24b8a847
TH
3711 struct workqueue_struct *wq;
3712 struct pool_workqueue *pwq;
3713 struct worker_pool *pool;
3714 int id;
a0a1a5fd 3715
e98d5b16 3716 spin_lock_irq(&workqueue_lock);
a0a1a5fd
TH
3717
3718 if (!workqueue_freezing)
3719 goto out_unlock;
3720
24b8a847
TH
3721 /* clear FREEZING */
3722 for_each_pool(pool, id) {
3723 spin_lock(&pool->lock);
3724 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3725 pool->flags &= ~POOL_FREEZING;
3726 spin_unlock(&pool->lock);
3727 }
8b03ae3c 3728
24b8a847
TH
3729 /* restore max_active and repopulate worklist */
3730 list_for_each_entry(wq, &workqueues, list) {
3731 if (!(wq->flags & WQ_FREEZABLE))
3732 continue;
a1056305 3733
24b8a847
TH
3734 for_each_pwq(pwq, wq) {
3735 spin_lock(&pwq->pool->lock);
3736 pwq_set_max_active(pwq, wq->saved_max_active);
3737 spin_unlock(&pwq->pool->lock);
d565ed63 3738 }
a0a1a5fd
TH
3739 }
3740
24b8a847
TH
3741 /* kick workers */
3742 for_each_pool(pool, id) {
3743 spin_lock(&pool->lock);
3744 wake_up_worker(pool);
3745 spin_unlock(&pool->lock);
3746 }
3747
a0a1a5fd
TH
3748 workqueue_freezing = false;
3749out_unlock:
e98d5b16 3750 spin_unlock_irq(&workqueue_lock);
a0a1a5fd
TH
3751}
3752#endif /* CONFIG_FREEZER */
3753
6ee0578b 3754static int __init init_workqueues(void)
1da177e4 3755{
d84ff051 3756 int cpu;
c34056a3 3757
7c3eed5c
TH
3758 /* make sure we have enough bits for OFFQ pool ID */
3759 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
6be19588 3760 WORK_CPU_END * NR_STD_WORKER_POOLS);
b5490077 3761
e904e6c2
TH
3762 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
3763
3764 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
3765
65758202 3766 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
a5b4e57d 3767 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
8b03ae3c 3768
706026c2
TH
3769 /* initialize CPU pools */
3770 for_each_wq_cpu(cpu) {
4ce62e9e 3771 struct worker_pool *pool;
8b03ae3c 3772
38db41d9 3773 for_each_std_worker_pool(pool, cpu) {
d565ed63 3774 spin_lock_init(&pool->lock);
ec22ca5e 3775 pool->cpu = cpu;
24647570 3776 pool->flags |= POOL_DISASSOCIATED;
4ce62e9e
TH
3777 INIT_LIST_HEAD(&pool->worklist);
3778 INIT_LIST_HEAD(&pool->idle_list);
c9e7cf27 3779 hash_init(pool->busy_hash);
e7577c50 3780
4ce62e9e
TH
3781 init_timer_deferrable(&pool->idle_timer);
3782 pool->idle_timer.function = idle_worker_timeout;
3783 pool->idle_timer.data = (unsigned long)pool;
e22bee78 3784
706026c2 3785 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
4ce62e9e
TH
3786 (unsigned long)pool);
3787
b2eb83d1 3788 mutex_init(&pool->assoc_mutex);
4ce62e9e 3789 ida_init(&pool->worker_ida);
9daf9e67
TH
3790
3791 /* alloc pool ID */
3792 BUG_ON(worker_pool_assign_id(pool));
4ce62e9e 3793 }
8b03ae3c
TH
3794 }
3795
e22bee78 3796 /* create the initial worker */
706026c2 3797 for_each_online_wq_cpu(cpu) {
4ce62e9e 3798 struct worker_pool *pool;
e22bee78 3799
38db41d9 3800 for_each_std_worker_pool(pool, cpu) {
4ce62e9e
TH
3801 struct worker *worker;
3802
24647570
TH
3803 if (cpu != WORK_CPU_UNBOUND)
3804 pool->flags &= ~POOL_DISASSOCIATED;
3805
bc2ae0f5 3806 worker = create_worker(pool);
4ce62e9e 3807 BUG_ON(!worker);
d565ed63 3808 spin_lock_irq(&pool->lock);
4ce62e9e 3809 start_worker(worker);
d565ed63 3810 spin_unlock_irq(&pool->lock);
4ce62e9e 3811 }
e22bee78
TH
3812 }
3813
d320c038 3814 system_wq = alloc_workqueue("events", 0, 0);
1aabe902 3815 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
d320c038 3816 system_long_wq = alloc_workqueue("events_long", 0, 0);
f3421797
TH
3817 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
3818 WQ_UNBOUND_MAX_ACTIVE);
24d51add
TH
3819 system_freezable_wq = alloc_workqueue("events_freezable",
3820 WQ_FREEZABLE, 0);
1aabe902 3821 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
ae930e0f 3822 !system_unbound_wq || !system_freezable_wq);
6ee0578b 3823 return 0;
1da177e4 3824}
6ee0578b 3825early_initcall(init_workqueues);