workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism
[linux-block.git] / kernel / workqueue.c
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4 2/*
c54fce6e 3 * kernel/workqueue.c - generic async execution with shared worker pool
1da177e4 4 *
c54fce6e 5 * Copyright (C) 2002 Ingo Molnar
1da177e4 6 *
c54fce6e
TH
7 * Derived from the taskqueue/keventd code by:
8 * David Woodhouse <dwmw2@infradead.org>
9 * Andrew Morton
10 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
11 * Theodore Ts'o <tytso@mit.edu>
1da177e4 12 *
c54fce6e 13 * Made to use alloc_percpu by Christoph Lameter.
1da177e4 14 *
c54fce6e
TH
15 * Copyright (C) 2010 SUSE Linux Products GmbH
16 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
89ada679 17 *
c54fce6e
TH
18 * This is the generic async execution mechanism. Work items are
19 * executed in process context. The worker pool is shared and
b11895c4
L
20 * automatically managed. There are two worker pools for each CPU (one for
21 * normal work items and the other for high priority ones) and some extra
22 * pools for workqueues which are not bound to any specific CPU - the
23 * number of these backing pools is dynamic.
c54fce6e 24 *
9a261491 25 * Please read Documentation/core-api/workqueue.rst for details.
1da177e4
LT
26 */
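/*
 * Editor's illustrative sketch (not part of the original file): minimal
 * client-side usage of this mechanism. The handler runs in process context
 * on a shared kworker and schedule_work() queues it onto system_wq. The
 * names example_work_fn/example_work are hypothetical.
 *
 *	static void example_work_fn(struct work_struct *work)
 *	{
 *		pr_info("example work executed\n");
 *	}
 *
 *	static DECLARE_WORK(example_work, example_work_fn);
 *
 *	schedule_work(&example_work);
 */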
27
9984de1a 28#include <linux/export.h>
1da177e4
LT
29#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/signal.h>
33#include <linux/completion.h>
34#include <linux/workqueue.h>
35#include <linux/slab.h>
36#include <linux/cpu.h>
37#include <linux/notifier.h>
38#include <linux/kthread.h>
1fa44eca 39#include <linux/hardirq.h>
46934023 40#include <linux/mempolicy.h>
341a5958 41#include <linux/freezer.h>
d5abe669 42#include <linux/debug_locks.h>
4e6045f1 43#include <linux/lockdep.h>
c34056a3 44#include <linux/idr.h>
29c91e99 45#include <linux/jhash.h>
42f8570f 46#include <linux/hashtable.h>
76af4d93 47#include <linux/rculist.h>
bce90380 48#include <linux/nodemask.h>
4c16bd32 49#include <linux/moduleparam.h>
3d1cb205 50#include <linux/uaccess.h>
c98a9805 51#include <linux/sched/isolation.h>
cd2440d6 52#include <linux/sched/debug.h>
62635ea8 53#include <linux/nmi.h>
940d71c6 54#include <linux/kvm_para.h>
e22bee78 55
ea138446 56#include "workqueue_internal.h"
1da177e4 57
c8e55f36 58enum {
24647570
TH
59 /*
60 * worker_pool flags
bc2ae0f5 61 *
24647570 62 * A bound pool is either associated or disassociated with its CPU.
bc2ae0f5
TH
63 * While associated (!DISASSOCIATED), all workers are bound to the
64 * CPU and none has %WORKER_UNBOUND set and concurrency management
65 * is in effect.
66 *
67 * While DISASSOCIATED, the cpu may be offline and all workers have
68 * %WORKER_UNBOUND set and concurrency management disabled, and may
24647570 69 * be executing on any CPU. The pool behaves as an unbound one.
bc2ae0f5 70 *
bc3a1afc 71 * Note that DISASSOCIATED should be flipped only while holding
1258fae7 72 * wq_pool_attach_mutex to avoid changing binding state while
4736cbf7 73 * worker_attach_to_pool() is in progress.
bc2ae0f5 74 */
692b4825 75 POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
24647570 76 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
db7bccf4 77
c8e55f36 78 /* worker flags */
c8e55f36
TH
79 WORKER_DIE = 1 << 1, /* die die die */
80 WORKER_IDLE = 1 << 2, /* is idle */
e22bee78 81 WORKER_PREP = 1 << 3, /* preparing to run works */
fb0e7beb 82 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
f3421797 83 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
a9ab775b 84 WORKER_REBOUND = 1 << 8, /* worker was rebound */
e22bee78 85
a9ab775b
TH
86 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
87 WORKER_UNBOUND | WORKER_REBOUND,
db7bccf4 88
e34cdddb 89 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
4ce62e9e 90
29c91e99 91 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
c8e55f36 92 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
db7bccf4 93
e22bee78
TH
94 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
95 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
96
3233cdbd
TH
97 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
98 /* call for help after 10ms
99 (min two ticks) */
e22bee78
TH
100 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
 101 CREATE_COOLDOWN = HZ, /* time to breathe after fail */
e22bee78
TH
102
103 /*
104 * Rescue workers are used only on emergencies and shared by
8698a745 105 * all cpus. Give MIN_NICE.
e22bee78 106 */
8698a745
DY
107 RESCUER_NICE_LEVEL = MIN_NICE,
108 HIGHPRI_NICE_LEVEL = MIN_NICE,
ecf6881f
TH
109
110 WQ_NAME_LEN = 24,
c8e55f36 111};
1da177e4
LT
112
113/*
4690c4ab
TH
114 * Structure fields follow one of the following exclusion rules.
115 *
e41e704b
TH
116 * I: Modifiable by initialization/destruction paths and read-only for
117 * everyone else.
4690c4ab 118 *
e22bee78
TH
119 * P: Preemption protected. Disabling preemption is enough and should
120 * only be modified and accessed from the local cpu.
121 *
d565ed63 122 * L: pool->lock protected. Access with pool->lock held.
4690c4ab 123 *
d565ed63
TH
124 * X: During normal operation, modification requires pool->lock and should
125 * be done only from local cpu. Either disabling preemption on local
126 * cpu or grabbing pool->lock is enough for read access. If
127 * POOL_DISASSOCIATED is set, it's identical to L.
e22bee78 128 *
bdf8b9bf
TH
129 * K: Only modified by worker while holding pool->lock. Can be safely read by
130 * self, while holding pool->lock or from IRQ context if %current is the
131 * kworker.
132 *
133 * S: Only modified by worker self.
134 *
1258fae7 135 * A: wq_pool_attach_mutex protected.
822d8405 136 *
68e13a67 137 * PL: wq_pool_mutex protected.
5bcab335 138 *
24acfb71 139 * PR: wq_pool_mutex protected for writes. RCU protected for reads.
76af4d93 140 *
5b95e1af
LJ
141 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
142 *
143 * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
24acfb71 144 * RCU for reads.
5b95e1af 145 *
3c25a55d
LJ
146 * WQ: wq->mutex protected.
147 *
24acfb71 148 * WR: wq->mutex protected for writes. RCU protected for reads.
2e109a28
TH
149 *
150 * MD: wq_mayday_lock protected.
cd2440d6
PM
151 *
152 * WD: Used internally by the watchdog.
1da177e4 153 */
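/*
 * Editor's note (illustrative, not part of the original file): as an example
 * of the "L:" rule above, a field such as pool->worklist may only be touched
 * with pool->lock held, e.g.
 *
 *	raw_spin_lock_irq(&pool->lock);
 *	list_add_tail(&work->entry, &pool->worklist);
 *	raw_spin_unlock_irq(&pool->lock);
 *
 * which is the pattern the queueing and worker paths below follow.
 */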
1da177e4 154
2eaebdb3 155/* struct worker is defined in workqueue_internal.h */
c34056a3 156
bd7bdd43 157struct worker_pool {
a9b8a985 158 raw_spinlock_t lock; /* the pool lock */
d84ff051 159 int cpu; /* I: the associated cpu */
f3f90ad4 160 int node; /* I: the associated node ID */
9daf9e67 161 int id; /* I: pool ID */
11ebea50 162 unsigned int flags; /* X: flags */
bd7bdd43 163
82607adc 164 unsigned long watchdog_ts; /* L: watchdog timestamp */
cd2440d6 165 bool cpu_stall; /* WD: stalled cpu bound pool */
82607adc 166
bc35f7ef
LJ
167 /*
168 * The counter is incremented in a process context on the associated CPU
169 * w/ preemption disabled, and decremented or reset in the same context
170 * but w/ pool->lock held. The readers grab pool->lock and are
 171 * guaranteed to see whether the counter has reached zero.
172 */
173 int nr_running;
84f91c62 174
bd7bdd43 175 struct list_head worklist; /* L: list of pending works */
ea1abd61 176
5826cc8f
LJ
177 int nr_workers; /* L: total number of workers */
178 int nr_idle; /* L: currently idle workers */
bd7bdd43 179
2c1f1a91 180 struct list_head idle_list; /* L: list of idle workers */
bd7bdd43 181 struct timer_list idle_timer; /* L: worker idle timeout */
3f959aa3
VS
182 struct work_struct idle_cull_work; /* L: worker idle cleanup */
183
184 struct timer_list mayday_timer; /* L: SOS timer for workers */
bd7bdd43 185
c5aa87bb 186 /* a worker is either on busy_hash or idle_list, or is the manager */
c9e7cf27
TH
187 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
188 /* L: hash of busy workers */
189
2607d7a6 190 struct worker *manager; /* L: purely informational */
92f9c5c4 191 struct list_head workers; /* A: attached workers */
e02b9312 192 struct list_head dying_workers; /* A: workers about to die */
60f5a4bc 193 struct completion *detach_completion; /* all workers detached */
e19e397a 194
7cda9aae 195 struct ida worker_ida; /* worker IDs for task name */
e19e397a 196
7a4e344c 197 struct workqueue_attrs *attrs; /* I: worker attributes */
68e13a67
LJ
198 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
199 int refcnt; /* PL: refcnt for unbound pools */
7a4e344c 200
29c91e99 201 /*
24acfb71 202 * Destruction of pool is RCU protected to allow dereferences
29c91e99
TH
203 * from get_work_pool().
204 */
205 struct rcu_head rcu;
84f91c62 206};
8b03ae3c 207
725e8ec5
TH
208/*
209 * Per-pool_workqueue statistics. These can be monitored using
210 * tools/workqueue/wq_monitor.py.
211 */
212enum pool_workqueue_stats {
213 PWQ_STAT_STARTED, /* work items started execution */
214 PWQ_STAT_COMPLETED, /* work items completed execution */
616db877 215 PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
725e8ec5
TH
216 PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
217 PWQ_STAT_MAYDAY, /* maydays to rescuer */
218 PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
219
220 PWQ_NR_STATS,
221};
222
1da177e4 223/*
112202d9
TH
224 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
225 * of work_struct->data are used for flags and the remaining high bits
226 * point to the pwq; thus, pwqs need to be aligned at two's power of the
227 * number of flag bits.
1da177e4 228 */
112202d9 229struct pool_workqueue {
bd7bdd43 230 struct worker_pool *pool; /* I: the associated pool */
4690c4ab 231 struct workqueue_struct *wq; /* I: the owning workqueue */
73f53c4a
TH
232 int work_color; /* L: current color */
233 int flush_color; /* L: flushing color */
8864b4e5 234 int refcnt; /* L: reference count */
73f53c4a
TH
235 int nr_in_flight[WORK_NR_COLORS];
236 /* L: nr of in_flight works */
018f3a13
LJ
237
238 /*
239 * nr_active management and WORK_STRUCT_INACTIVE:
240 *
241 * When pwq->nr_active >= max_active, new work item is queued to
242 * pwq->inactive_works instead of pool->worklist and marked with
243 * WORK_STRUCT_INACTIVE.
244 *
245 * All work items marked with WORK_STRUCT_INACTIVE do not participate
246 * in pwq->nr_active and all work items in pwq->inactive_works are
247 * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE
248 * work items are in pwq->inactive_works. Some of them are ready to
 249 * run in pool->worklist or worker->scheduled. Those work items are
250 * only struct wq_barrier which is used for flush_work() and should
251 * not participate in pwq->nr_active. For non-barrier work item, it
252 * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
253 */
1e19ffc6 254 int nr_active; /* L: nr of active works */
a0a1a5fd 255 int max_active; /* L: max active works */
f97a4a1a 256 struct list_head inactive_works; /* L: inactive works */
3c25a55d 257 struct list_head pwqs_node; /* WR: node on wq->pwqs */
2e109a28 258 struct list_head mayday_node; /* MD: node on wq->maydays */
8864b4e5 259
725e8ec5
TH
260 u64 stats[PWQ_NR_STATS];
261
8864b4e5
TH
262 /*
263 * Release of unbound pwq is punted to system_wq. See put_pwq()
264 * and pwq_unbound_release_workfn() for details. pool_workqueue
24acfb71 265 * itself is also RCU protected so that the first pwq can be
b09f4fd3 266 * determined without grabbing wq->mutex.
8864b4e5
TH
267 */
268 struct work_struct unbound_release_work;
269 struct rcu_head rcu;
e904e6c2 270} __aligned(1 << WORK_STRUCT_FLAG_BITS);
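/*
 * Editor's illustrative sketch (not part of the original file): the nr_active
 * rules documented inside struct pool_workqueue above reduce to roughly the
 * following decision on the queueing path, simplified from __queue_work()
 * which appears later in this file:
 *
 *	if (pwq->nr_active < pwq->max_active) {
 *		pwq->nr_active++;
 *		worklist = &pwq->pool->worklist;
 *	} else {
 *		work_flags |= WORK_STRUCT_INACTIVE;
 *		worklist = &pwq->inactive_works;
 *	}
 *	insert_work(pwq, work, worklist, work_flags);
 */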
1da177e4 271
73f53c4a
TH
272/*
273 * Structure used to wait for workqueue flush.
274 */
275struct wq_flusher {
3c25a55d
LJ
276 struct list_head list; /* WQ: list of flushers */
277 int flush_color; /* WQ: flush color waiting for */
73f53c4a
TH
278 struct completion done; /* flush completion */
279};
280
226223ab
TH
281struct wq_device;
282
1da177e4 283/*
c5aa87bb
TH
284 * The externally visible workqueue. It relays the issued work items to
285 * the appropriate worker_pool through its pool_workqueues.
1da177e4
LT
286 */
287struct workqueue_struct {
3c25a55d 288 struct list_head pwqs; /* WR: all pwqs of this wq */
e2dca7ad 289 struct list_head list; /* PR: list of all workqueues */
73f53c4a 290
3c25a55d
LJ
291 struct mutex mutex; /* protects this wq */
292 int work_color; /* WQ: current work color */
293 int flush_color; /* WQ: current flush color */
112202d9 294 atomic_t nr_pwqs_to_flush; /* flush in progress */
3c25a55d
LJ
295 struct wq_flusher *first_flusher; /* WQ: first flusher */
296 struct list_head flusher_queue; /* WQ: flush waiters */
297 struct list_head flusher_overflow; /* WQ: flush overflow list */
73f53c4a 298
2e109a28 299 struct list_head maydays; /* MD: pwqs requesting rescue */
30ae2fc0 300 struct worker *rescuer; /* MD: rescue worker */
e22bee78 301
87fc741e 302 int nr_drainers; /* WQ: drain in progress */
a357fc03 303 int saved_max_active; /* WQ: saved pwq max_active */
226223ab 304
5b95e1af
LJ
305 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
306 struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */
6029a918 307
226223ab
TH
308#ifdef CONFIG_SYSFS
309 struct wq_device *wq_dev; /* I: for sysfs interface */
310#endif
4e6045f1 311#ifdef CONFIG_LOCKDEP
669de8bd
BVA
312 char *lock_name;
313 struct lock_class_key key;
4690c4ab 314 struct lockdep_map lockdep_map;
4e6045f1 315#endif
ecf6881f 316 char name[WQ_NAME_LEN]; /* I: workqueue name */
2728fd2f 317
e2dca7ad 318 /*
24acfb71
TG
319 * Destruction of workqueue_struct is RCU protected to allow walking
320 * the workqueues list without grabbing wq_pool_mutex.
e2dca7ad
TH
321 * This is used to dump all workqueues from sysrq.
322 */
323 struct rcu_head rcu;
324
2728fd2f
TH
325 /* hot fields used during command issue, aligned to cacheline */
326 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
327 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
5b95e1af 328 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
1da177e4
LT
329};
330
e904e6c2
TH
331static struct kmem_cache *pwq_cache;
332
bce90380
TH
333static cpumask_var_t *wq_numa_possible_cpumask;
334 /* possible CPUs of each node */
335
616db877
TH
336/*
337 * Per-cpu work items which run for longer than the following threshold are
338 * automatically considered CPU intensive and excluded from concurrency
339 * management to prevent them from noticeably delaying other per-cpu work items.
340 */
341static unsigned long wq_cpu_intensive_thresh_us = 10000;
342module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
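/*
 * Editor's note (illustrative): given the 0644 permissions above, the
 * threshold is normally adjustable at runtime through
 * /sys/module/workqueue/parameters/cpu_intensive_thresh_us and at boot via
 * the workqueue.cpu_intensive_thresh_us= kernel parameter.
 */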
343
d55262c4
TH
344static bool wq_disable_numa;
345module_param_named(disable_numa, wq_disable_numa, bool, 0444);
346
cee22a15 347/* see the comment above the definition of WQ_POWER_EFFICIENT */
552f530c 348static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
cee22a15
VK
349module_param_named(power_efficient, wq_power_efficient, bool, 0444);
350
863b710b 351static bool wq_online; /* can kworkers be created yet? */
3347fa09 352
bce90380
TH
353static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
354
4c16bd32
TH
355/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
356static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
357
68e13a67 358static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
1258fae7 359static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
a9b8a985 360static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
d8bb65ab
SAS
361/* wait for manager to go away */
362static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
5bcab335 363
e2dca7ad 364static LIST_HEAD(workqueues); /* PR: list of all workqueues */
68e13a67 365static bool workqueue_freezing; /* PL: have wqs started freezing? */
7d19c5ce 366
99c621ef 367/* PL&A: allowable cpus for unbound wqs and work items */
ef557180
MG
368static cpumask_var_t wq_unbound_cpumask;
369
370/* CPU where unbound work was last round robin scheduled from this CPU */
371static DEFINE_PER_CPU(int, wq_rr_cpu_last);
b05a7928 372
f303fccb
TH
373/*
374 * Local execution of unbound work items is no longer guaranteed. The
375 * following always forces round-robin CPU selection on unbound work items
376 * to uncover usages which depend on it.
377 */
378#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
379static bool wq_debug_force_rr_cpu = true;
380#else
381static bool wq_debug_force_rr_cpu = false;
382#endif
383module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
384
7d19c5ce 385/* the per-cpu worker pools */
25528213 386static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
7d19c5ce 387
68e13a67 388static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
7d19c5ce 389
68e13a67 390/* PL: hash of all unbound pools keyed by pool->attrs */
29c91e99
TH
391static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
392
c5aa87bb 393/* I: attributes used when instantiating standard unbound pools on demand */
29c91e99
TH
394static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
395
8a2b7538
TH
396/* I: attributes used when instantiating ordered pools on demand */
397static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
398
d320c038 399struct workqueue_struct *system_wq __read_mostly;
ad7b1f84 400EXPORT_SYMBOL(system_wq);
044c782c 401struct workqueue_struct *system_highpri_wq __read_mostly;
1aabe902 402EXPORT_SYMBOL_GPL(system_highpri_wq);
044c782c 403struct workqueue_struct *system_long_wq __read_mostly;
d320c038 404EXPORT_SYMBOL_GPL(system_long_wq);
044c782c 405struct workqueue_struct *system_unbound_wq __read_mostly;
f3421797 406EXPORT_SYMBOL_GPL(system_unbound_wq);
044c782c 407struct workqueue_struct *system_freezable_wq __read_mostly;
24d51add 408EXPORT_SYMBOL_GPL(system_freezable_wq);
0668106c
VK
409struct workqueue_struct *system_power_efficient_wq __read_mostly;
410EXPORT_SYMBOL_GPL(system_power_efficient_wq);
411struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
412EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
d320c038 413
7d19c5ce 414static int worker_thread(void *__worker);
6ba94429 415static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
c29eb853 416static void show_pwq(struct pool_workqueue *pwq);
55df0933 417static void show_one_worker_pool(struct worker_pool *pool);
7d19c5ce 418
97bd2347
TH
419#define CREATE_TRACE_POINTS
420#include <trace/events/workqueue.h>
421
68e13a67 422#define assert_rcu_or_pool_mutex() \
24acfb71 423 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
f78f5b90 424 !lockdep_is_held(&wq_pool_mutex), \
24acfb71 425 "RCU or wq_pool_mutex should be held")
5bcab335 426
5b95e1af 427#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
24acfb71 428 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
f78f5b90
PM
429 !lockdep_is_held(&wq->mutex) && \
430 !lockdep_is_held(&wq_pool_mutex), \
24acfb71 431 "RCU, wq->mutex or wq_pool_mutex should be held")
5b95e1af 432
f02ae73a
TH
433#define for_each_cpu_worker_pool(pool, cpu) \
434 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
435 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
7a62c2c8 436 (pool)++)
4ce62e9e 437
17116969
TH
438/**
439 * for_each_pool - iterate through all worker_pools in the system
440 * @pool: iteration cursor
611c92a0 441 * @pi: integer used for iteration
fa1b54e6 442 *
24acfb71 443 * This must be called either with wq_pool_mutex held or RCU read
68e13a67
LJ
444 * locked. If the pool needs to be used beyond the locking in effect, the
445 * caller is responsible for guaranteeing that the pool stays online.
fa1b54e6
TH
446 *
447 * The if/else clause exists only for the lockdep assertion and can be
448 * ignored.
17116969 449 */
611c92a0
TH
450#define for_each_pool(pool, pi) \
451 idr_for_each_entry(&worker_pool_idr, pool, pi) \
68e13a67 452 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
fa1b54e6 453 else
17116969 454
822d8405
TH
455/**
456 * for_each_pool_worker - iterate through all workers of a worker_pool
457 * @worker: iteration cursor
822d8405
TH
458 * @pool: worker_pool to iterate workers of
459 *
1258fae7 460 * This must be called with wq_pool_attach_mutex held.
822d8405
TH
461 *
462 * The if/else clause exists only for the lockdep assertion and can be
463 * ignored.
464 */
da028469
LJ
465#define for_each_pool_worker(worker, pool) \
466 list_for_each_entry((worker), &(pool)->workers, node) \
1258fae7 467 if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
822d8405
TH
468 else
469
49e3cf44
TH
470/**
471 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
472 * @pwq: iteration cursor
473 * @wq: the target workqueue
76af4d93 474 *
24acfb71 475 * This must be called either with wq->mutex held or RCU read locked.
794b18bc
TH
476 * If the pwq needs to be used beyond the locking in effect, the caller is
477 * responsible for guaranteeing that the pwq stays online.
76af4d93
TH
478 *
479 * The if/else clause exists only for the lockdep assertion and can be
480 * ignored.
49e3cf44
TH
481 */
482#define for_each_pwq(pwq, wq) \
49e9d1a9 483 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
5a644662 484 lockdep_is_held(&(wq->mutex)))
f3421797 485
dc186ad7
TG
486#ifdef CONFIG_DEBUG_OBJECTS_WORK
487
f9e62f31 488static const struct debug_obj_descr work_debug_descr;
dc186ad7 489
99777288
SG
490static void *work_debug_hint(void *addr)
491{
492 return ((struct work_struct *) addr)->func;
493}
494
b9fdac7f
DC
495static bool work_is_static_object(void *addr)
496{
497 struct work_struct *work = addr;
498
499 return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
500}
501
dc186ad7
TG
502/*
503 * fixup_init is called when:
504 * - an active object is initialized
505 */
02a982a6 506static bool work_fixup_init(void *addr, enum debug_obj_state state)
dc186ad7
TG
507{
508 struct work_struct *work = addr;
509
510 switch (state) {
511 case ODEBUG_STATE_ACTIVE:
512 cancel_work_sync(work);
513 debug_object_init(work, &work_debug_descr);
02a982a6 514 return true;
dc186ad7 515 default:
02a982a6 516 return false;
dc186ad7
TG
517 }
518}
519
dc186ad7
TG
520/*
521 * fixup_free is called when:
522 * - an active object is freed
523 */
02a982a6 524static bool work_fixup_free(void *addr, enum debug_obj_state state)
dc186ad7
TG
525{
526 struct work_struct *work = addr;
527
528 switch (state) {
529 case ODEBUG_STATE_ACTIVE:
530 cancel_work_sync(work);
531 debug_object_free(work, &work_debug_descr);
02a982a6 532 return true;
dc186ad7 533 default:
02a982a6 534 return false;
dc186ad7
TG
535 }
536}
537
f9e62f31 538static const struct debug_obj_descr work_debug_descr = {
dc186ad7 539 .name = "work_struct",
99777288 540 .debug_hint = work_debug_hint,
b9fdac7f 541 .is_static_object = work_is_static_object,
dc186ad7 542 .fixup_init = work_fixup_init,
dc186ad7
TG
543 .fixup_free = work_fixup_free,
544};
545
546static inline void debug_work_activate(struct work_struct *work)
547{
548 debug_object_activate(work, &work_debug_descr);
549}
550
551static inline void debug_work_deactivate(struct work_struct *work)
552{
553 debug_object_deactivate(work, &work_debug_descr);
554}
555
556void __init_work(struct work_struct *work, int onstack)
557{
558 if (onstack)
559 debug_object_init_on_stack(work, &work_debug_descr);
560 else
561 debug_object_init(work, &work_debug_descr);
562}
563EXPORT_SYMBOL_GPL(__init_work);
564
565void destroy_work_on_stack(struct work_struct *work)
566{
567 debug_object_free(work, &work_debug_descr);
568}
569EXPORT_SYMBOL_GPL(destroy_work_on_stack);
570
ea2e64f2
TG
571void destroy_delayed_work_on_stack(struct delayed_work *work)
572{
573 destroy_timer_on_stack(&work->timer);
574 debug_object_free(&work->work, &work_debug_descr);
575}
576EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
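/*
 * Editor's illustrative sketch (not part of the original file): the on-stack
 * helpers above are meant to be paired in callers roughly as follows; the
 * handler name example_fn is hypothetical.
 *
 *	struct work_struct work;
 *
 *	INIT_WORK_ONSTACK(&work, example_fn);
 *	schedule_work(&work);
 *	flush_work(&work);
 *	destroy_work_on_stack(&work);
 */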
577
dc186ad7
TG
578#else
579static inline void debug_work_activate(struct work_struct *work) { }
580static inline void debug_work_deactivate(struct work_struct *work) { }
581#endif
582
4e8b22bd 583/**
67dc8325 584 * worker_pool_assign_id - allocate ID and assign it to @pool
4e8b22bd
LB
585 * @pool: the pool pointer of interest
586 *
587 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
588 * successfully, -errno on failure.
589 */
9daf9e67
TH
590static int worker_pool_assign_id(struct worker_pool *pool)
591{
592 int ret;
593
68e13a67 594 lockdep_assert_held(&wq_pool_mutex);
5bcab335 595
4e8b22bd
LB
596 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
597 GFP_KERNEL);
229641a6 598 if (ret >= 0) {
e68035fb 599 pool->id = ret;
229641a6
TH
600 return 0;
601 }
fa1b54e6 602 return ret;
7c3eed5c
TH
603}
604
df2d5ae4
TH
605/**
606 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
607 * @wq: the target workqueue
608 * @node: the node ID
609 *
24acfb71 610 * This must be called with any of wq_pool_mutex, wq->mutex or RCU
5b95e1af 611 * read locked.
df2d5ae4
TH
612 * If the pwq needs to be used beyond the locking in effect, the caller is
613 * responsible for guaranteeing that the pwq stays online.
d185af30
YB
614 *
615 * Return: The unbound pool_workqueue for @node.
df2d5ae4
TH
616 */
617static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
618 int node)
619{
5b95e1af 620 assert_rcu_or_wq_mutex_or_pool_mutex(wq);
d6e022f1
TH
621
622 /*
623 * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
624 * delayed item is pending. The plan is to keep CPU -> NODE
625 * mapping valid and stable across CPU on/offlines. Once that
626 * happens, this workaround can be removed.
627 */
628 if (unlikely(node == NUMA_NO_NODE))
629 return wq->dfl_pwq;
630
df2d5ae4
TH
631 return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
632}
633
73f53c4a
TH
634static unsigned int work_color_to_flags(int color)
635{
636 return color << WORK_STRUCT_COLOR_SHIFT;
637}
638
c4560c2c 639static int get_work_color(unsigned long work_data)
73f53c4a 640{
c4560c2c 641 return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
73f53c4a
TH
642 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
643}
644
645static int work_next_color(int color)
646{
647 return (color + 1) % WORK_NR_COLORS;
648}
1da177e4 649
14441960 650/*
112202d9
TH
 651 * While queued, %WORK_STRUCT_PWQ is set and the non-flag bits of a work's data
652 * contain the pointer to the queued pwq. Once execution starts, the flag
7c3eed5c 653 * is cleared and the high bits contain OFFQ flags and pool ID.
7a22ad75 654 *
112202d9
TH
655 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
656 * and clear_work_data() can be used to set the pwq, pool or clear
bbb68dfa
TH
657 * work->data. These functions should only be called while the work is
658 * owned - ie. while the PENDING bit is set.
7a22ad75 659 *
112202d9 660 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
7c3eed5c 661 * corresponding to a work. Pool is available once the work has been
112202d9 662 * queued anywhere after initialization until it is sync canceled. pwq is
7c3eed5c 663 * available only while the work item is queued.
7a22ad75 664 *
bbb68dfa
TH
665 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
666 * canceled. While being canceled, a work item may have its PENDING set
667 * but stay off timer and worklist for arbitrarily long and nobody should
668 * try to steal the PENDING bit.
14441960 669 */
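/*
 * Editor's note (illustrative): the resulting work->data layouts are roughly
 *
 *	queued:		[ pwq pointer                     | WORK_STRUCT_* flags ]
 *	off queue:	[ pool ID << WORK_OFFQ_POOL_SHIFT | WORK_OFFQ_* flags | WORK_STRUCT_* flags ]
 *
 * as encoded by set_work_pwq() and the set_work_pool_and_*() helpers below.
 */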
7a22ad75
TH
670static inline void set_work_data(struct work_struct *work, unsigned long data,
671 unsigned long flags)
365970a1 672{
6183c009 673 WARN_ON_ONCE(!work_pending(work));
7a22ad75
TH
674 atomic_long_set(&work->data, data | flags | work_static(work));
675}
365970a1 676
112202d9 677static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
7a22ad75
TH
678 unsigned long extra_flags)
679{
112202d9
TH
680 set_work_data(work, (unsigned long)pwq,
681 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
365970a1
DH
682}
683
4468a00f
LJ
684static void set_work_pool_and_keep_pending(struct work_struct *work,
685 int pool_id)
686{
687 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
688 WORK_STRUCT_PENDING);
689}
690
7c3eed5c
TH
691static void set_work_pool_and_clear_pending(struct work_struct *work,
692 int pool_id)
7a22ad75 693{
23657bb1
TH
694 /*
695 * The following wmb is paired with the implied mb in
696 * test_and_set_bit(PENDING) and ensures all updates to @work made
697 * here are visible to and precede any updates by the next PENDING
698 * owner.
699 */
700 smp_wmb();
7c3eed5c 701 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
346c09f8
RP
702 /*
703 * The following mb guarantees that previous clear of a PENDING bit
704 * will not be reordered with any speculative LOADS or STORES from
705 * work->current_func, which is executed afterwards. This possible
8bdc6201 706 * reordering can lead to a missed execution on attempt to queue
346c09f8
RP
707 * the same @work. E.g. consider this case:
708 *
709 * CPU#0 CPU#1
710 * ---------------------------- --------------------------------
711 *
712 * 1 STORE event_indicated
713 * 2 queue_work_on() {
714 * 3 test_and_set_bit(PENDING)
715 * 4 } set_..._and_clear_pending() {
716 * 5 set_work_data() # clear bit
717 * 6 smp_mb()
718 * 7 work->current_func() {
719 * 8 LOAD event_indicated
720 * }
721 *
722 * Without an explicit full barrier speculative LOAD on line 8 can
723 * be executed before CPU#0 does STORE on line 1. If that happens,
724 * CPU#0 observes the PENDING bit is still set and new execution of
 725 * a @work is not queued in the hope that CPU#1 will eventually
726 * finish the queued @work. Meanwhile CPU#1 does not see
727 * event_indicated is set, because speculative LOAD was executed
728 * before actual STORE.
729 */
730 smp_mb();
7a22ad75 731}
f756d5e2 732
7a22ad75 733static void clear_work_data(struct work_struct *work)
1da177e4 734{
7c3eed5c
TH
735 smp_wmb(); /* see set_work_pool_and_clear_pending() */
736 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
1da177e4
LT
737}
738
112202d9 739static struct pool_workqueue *get_work_pwq(struct work_struct *work)
b1f4ec17 740{
e120153d 741 unsigned long data = atomic_long_read(&work->data);
7a22ad75 742
112202d9 743 if (data & WORK_STRUCT_PWQ)
e120153d
TH
744 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
745 else
746 return NULL;
4d707b9f
ON
747}
748
7c3eed5c
TH
749/**
750 * get_work_pool - return the worker_pool a given work was associated with
751 * @work: the work item of interest
752 *
68e13a67 753 * Pools are created and destroyed under wq_pool_mutex, and allows read
24acfb71
TG
754 * access under RCU read lock. As such, this function should be
755 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
fa1b54e6
TH
756 *
757 * All fields of the returned pool are accessible as long as the above
758 * mentioned locking is in effect. If the returned pool needs to be used
759 * beyond the critical section, the caller is responsible for ensuring the
760 * returned pool is and stays online.
d185af30
YB
761 *
762 * Return: The worker_pool @work was last associated with. %NULL if none.
7c3eed5c
TH
763 */
764static struct worker_pool *get_work_pool(struct work_struct *work)
365970a1 765{
e120153d 766 unsigned long data = atomic_long_read(&work->data);
7c3eed5c 767 int pool_id;
7a22ad75 768
68e13a67 769 assert_rcu_or_pool_mutex();
fa1b54e6 770
112202d9
TH
771 if (data & WORK_STRUCT_PWQ)
772 return ((struct pool_workqueue *)
7c3eed5c 773 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
7a22ad75 774
7c3eed5c
TH
775 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
776 if (pool_id == WORK_OFFQ_POOL_NONE)
7a22ad75
TH
777 return NULL;
778
fa1b54e6 779 return idr_find(&worker_pool_idr, pool_id);
7c3eed5c
TH
780}
781
782/**
783 * get_work_pool_id - return the worker pool ID a given work is associated with
784 * @work: the work item of interest
785 *
d185af30 786 * Return: The worker_pool ID @work was last associated with.
7c3eed5c
TH
787 * %WORK_OFFQ_POOL_NONE if none.
788 */
789static int get_work_pool_id(struct work_struct *work)
790{
54d5b7d0
LJ
791 unsigned long data = atomic_long_read(&work->data);
792
112202d9
TH
793 if (data & WORK_STRUCT_PWQ)
794 return ((struct pool_workqueue *)
54d5b7d0 795 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
7c3eed5c 796
54d5b7d0 797 return data >> WORK_OFFQ_POOL_SHIFT;
7c3eed5c
TH
798}
799
bbb68dfa
TH
800static void mark_work_canceling(struct work_struct *work)
801{
7c3eed5c 802 unsigned long pool_id = get_work_pool_id(work);
bbb68dfa 803
7c3eed5c
TH
804 pool_id <<= WORK_OFFQ_POOL_SHIFT;
805 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
bbb68dfa
TH
806}
807
808static bool work_is_canceling(struct work_struct *work)
809{
810 unsigned long data = atomic_long_read(&work->data);
811
112202d9 812 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
bbb68dfa
TH
813}
814
e22bee78 815/*
3270476a
TH
816 * Policy functions. These define the policies on how the global worker
817 * pools are managed. Unless noted otherwise, these functions assume that
d565ed63 818 * they're being called with pool->lock held.
e22bee78
TH
819 */
820
63d95a91 821static bool __need_more_worker(struct worker_pool *pool)
a848e3b6 822{
bc35f7ef 823 return !pool->nr_running;
a848e3b6
ON
824}
825
4594bf15 826/*
e22bee78
TH
827 * Need to wake up a worker? Called from anything but currently
828 * running workers.
974271c4
TH
829 *
830 * Note that, because unbound workers never contribute to nr_running, this
706026c2 831 * function will always return %true for unbound pools as long as the
974271c4 832 * worklist isn't empty.
4594bf15 833 */
63d95a91 834static bool need_more_worker(struct worker_pool *pool)
365970a1 835{
63d95a91 836 return !list_empty(&pool->worklist) && __need_more_worker(pool);
e22bee78 837}
4594bf15 838
e22bee78 839/* Can I start working? Called from busy but !running workers. */
63d95a91 840static bool may_start_working(struct worker_pool *pool)
e22bee78 841{
63d95a91 842 return pool->nr_idle;
e22bee78
TH
843}
844
845/* Do I need to keep working? Called from currently running workers. */
63d95a91 846static bool keep_working(struct worker_pool *pool)
e22bee78 847{
bc35f7ef 848 return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
e22bee78
TH
849}
850
851/* Do we need a new worker? Called from manager. */
63d95a91 852static bool need_to_create_worker(struct worker_pool *pool)
e22bee78 853{
63d95a91 854 return need_more_worker(pool) && !may_start_working(pool);
e22bee78 855}
365970a1 856
e22bee78 857/* Do we have too many workers and should some go away? */
63d95a91 858static bool too_many_workers(struct worker_pool *pool)
e22bee78 859{
692b4825 860 bool managing = pool->flags & POOL_MANAGER_ACTIVE;
63d95a91
TH
861 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
862 int nr_busy = pool->nr_workers - nr_idle;
e22bee78
TH
863
864 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
365970a1
DH
865}
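/*
 * Editor's note (illustrative): with MAX_IDLE_WORKERS_RATIO == 4, a pool with
 * 10 busy workers tolerates up to 4 idle ones; at 5 idle workers,
 * (5 - 2) * 4 == 12 >= 10, so too_many_workers() returns true and the idle
 * machinery can start trimming the pool.
 */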
866
4d707b9f 867/*
e22bee78
TH
868 * Wake up functions.
869 */
870
2c1f1a91 871/* Return the first idle worker. Called with pool->lock held. */
1037de36 872static struct worker *first_idle_worker(struct worker_pool *pool)
7e11629d 873{
63d95a91 874 if (unlikely(list_empty(&pool->idle_list)))
7e11629d
TH
875 return NULL;
876
63d95a91 877 return list_first_entry(&pool->idle_list, struct worker, entry);
7e11629d
TH
878}
879
880/**
881 * wake_up_worker - wake up an idle worker
63d95a91 882 * @pool: worker pool to wake worker from
7e11629d 883 *
63d95a91 884 * Wake up the first idle worker of @pool.
7e11629d
TH
885 *
886 * CONTEXT:
a9b8a985 887 * raw_spin_lock_irq(pool->lock).
7e11629d 888 */
63d95a91 889static void wake_up_worker(struct worker_pool *pool)
7e11629d 890{
1037de36 891 struct worker *worker = first_idle_worker(pool);
7e11629d
TH
892
893 if (likely(worker))
894 wake_up_process(worker->task);
895}
896
c54d5046
TH
897/**
898 * worker_set_flags - set worker flags and adjust nr_running accordingly
899 * @worker: self
900 * @flags: flags to set
901 *
902 * Set @flags in @worker->flags and adjust nr_running accordingly.
903 *
904 * CONTEXT:
905 * raw_spin_lock_irq(pool->lock)
906 */
907static inline void worker_set_flags(struct worker *worker, unsigned int flags)
908{
909 struct worker_pool *pool = worker->pool;
910
911 WARN_ON_ONCE(worker->task != current);
912
913 /* If transitioning into NOT_RUNNING, adjust nr_running. */
914 if ((flags & WORKER_NOT_RUNNING) &&
915 !(worker->flags & WORKER_NOT_RUNNING)) {
916 pool->nr_running--;
917 }
918
919 worker->flags |= flags;
920}
921
922/**
923 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
924 * @worker: self
925 * @flags: flags to clear
926 *
927 * Clear @flags in @worker->flags and adjust nr_running accordingly.
928 *
929 * CONTEXT:
930 * raw_spin_lock_irq(pool->lock)
931 */
932static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
933{
934 struct worker_pool *pool = worker->pool;
935 unsigned int oflags = worker->flags;
936
937 WARN_ON_ONCE(worker->task != current);
938
939 worker->flags &= ~flags;
940
941 /*
942 * If transitioning out of NOT_RUNNING, increment nr_running. Note
 943 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is a mask
944 * of multiple flags, not a single flag.
945 */
946 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
947 if (!(worker->flags & WORKER_NOT_RUNNING))
948 pool->nr_running++;
949}
950
63638450
TH
951#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
952
953/*
954 * Concurrency-managed per-cpu work items that hog CPU for longer than
955 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
956 * which prevents them from stalling other concurrency-managed work items. If a
957 * work function keeps triggering this mechanism, it's likely that the work item
958 * should be using an unbound workqueue instead.
959 *
960 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 961 * and reports them so that they can be examined and converted to use unbound
962 * workqueues as appropriate. To avoid flooding the console, each violating work
963 * function is tracked and reported with exponential backoff.
964 */
965#define WCI_MAX_ENTS 128
966
967struct wci_ent {
968 work_func_t func;
969 atomic64_t cnt;
970 struct hlist_node hash_node;
971};
972
973static struct wci_ent wci_ents[WCI_MAX_ENTS];
974static int wci_nr_ents;
975static DEFINE_RAW_SPINLOCK(wci_lock);
976static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
977
978static struct wci_ent *wci_find_ent(work_func_t func)
979{
980 struct wci_ent *ent;
981
982 hash_for_each_possible_rcu(wci_hash, ent, hash_node,
983 (unsigned long)func) {
984 if (ent->func == func)
985 return ent;
986 }
987 return NULL;
988}
989
990static void wq_cpu_intensive_report(work_func_t func)
991{
992 struct wci_ent *ent;
993
994restart:
995 ent = wci_find_ent(func);
996 if (ent) {
997 u64 cnt;
998
999 /*
1000 * Start reporting from the fourth time and back off
1001 * exponentially.
1002 */
1003 cnt = atomic64_inc_return_relaxed(&ent->cnt);
1004 if (cnt >= 4 && is_power_of_2(cnt))
1005 printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
1006 ent->func, wq_cpu_intensive_thresh_us,
1007 atomic64_read(&ent->cnt));
1008 return;
1009 }
1010
1011 /*
1012 * @func is a new violation. Allocate a new entry for it. If wci_ents[]
1013 * is exhausted, something went really wrong and we probably made enough
1014 * noise already.
1015 */
1016 if (wci_nr_ents >= WCI_MAX_ENTS)
1017 return;
1018
1019 raw_spin_lock(&wci_lock);
1020
1021 if (wci_nr_ents >= WCI_MAX_ENTS) {
1022 raw_spin_unlock(&wci_lock);
1023 return;
1024 }
1025
1026 if (wci_find_ent(func)) {
1027 raw_spin_unlock(&wci_lock);
1028 goto restart;
1029 }
1030
1031 ent = &wci_ents[wci_nr_ents++];
1032 ent->func = func;
1033 atomic64_set(&ent->cnt, 1);
1034 hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
1035
1036 raw_spin_unlock(&wci_lock);
1037}
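/*
 * Editor's note (illustrative): with the default 10ms threshold, a repeat
 * offender would be reported roughly as
 *
 *	workqueue: example_work_fn hogged CPU for >10000us 4 times,
 *	consider switching to WQ_UNBOUND
 *
 * where example_work_fn stands for whatever work function %ps resolves.
 */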
1038
1039#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1040static void wq_cpu_intensive_report(work_func_t func) {}
1041#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1042
d302f017 1043/**
6d25be57 1044 * wq_worker_running - a worker is running again
e22bee78 1045 * @task: task waking up
e22bee78 1046 *
6d25be57 1047 * This function is called when a worker returns from schedule()
e22bee78 1048 */
6d25be57 1049void wq_worker_running(struct task_struct *task)
e22bee78
TH
1050{
1051 struct worker *worker = kthread_data(task);
1052
6d25be57
TG
1053 if (!worker->sleeping)
1054 return;
07edfece
FW
1055
1056 /*
1057 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
1058 * and the nr_running increment below, we may ruin the nr_running reset
1059 * and leave with an unexpected pool->nr_running == 1 on the newly unbound
1060 * pool. Protect against such race.
1061 */
1062 preempt_disable();
6d25be57 1063 if (!(worker->flags & WORKER_NOT_RUNNING))
bc35f7ef 1064 worker->pool->nr_running++;
07edfece 1065 preempt_enable();
616db877
TH
1066
1067 /*
1068 * CPU intensive auto-detection cares about how long a work item hogged
1069 * CPU without sleeping. Reset the starting timestamp on wakeup.
1070 */
1071 worker->current_at = worker->task->se.sum_exec_runtime;
1072
6d25be57 1073 worker->sleeping = 0;
e22bee78
TH
1074}
1075
1076/**
1077 * wq_worker_sleeping - a worker is going to sleep
1078 * @task: task going to sleep
e22bee78 1079 *
6d25be57 1080 * This function is called from schedule() when a busy worker is
ccf45156 1081 * going to sleep.
e22bee78 1082 */
6d25be57 1083void wq_worker_sleeping(struct task_struct *task)
e22bee78 1084{
cc5bff38 1085 struct worker *worker = kthread_data(task);
111c225a 1086 struct worker_pool *pool;
e22bee78 1087
111c225a
TH
1088 /*
1089 * Rescuers, which may not have all the fields set up like normal
1090 * workers, also reach here; let's not access anything before
1091 * checking NOT_RUNNING.
1092 */
2d64672e 1093 if (worker->flags & WORKER_NOT_RUNNING)
6d25be57 1094 return;
e22bee78 1095
111c225a 1096 pool = worker->pool;
111c225a 1097
62849a96
SAS
1098 /* Return if preempted before wq_worker_running() was reached */
1099 if (worker->sleeping)
6d25be57
TG
1100 return;
1101
1102 worker->sleeping = 1;
a9b8a985 1103 raw_spin_lock_irq(&pool->lock);
e22bee78 1104
45c753f5
FW
1105 /*
1106 * Recheck in case unbind_workers() preempted us. We don't
1107 * want to decrement nr_running after the worker is unbound
1108 * and nr_running has been reset.
1109 */
1110 if (worker->flags & WORKER_NOT_RUNNING) {
1111 raw_spin_unlock_irq(&pool->lock);
1112 return;
1113 }
1114
bc35f7ef 1115 pool->nr_running--;
725e8ec5
TH
1116 if (need_more_worker(pool)) {
1117 worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
cc5bff38 1118 wake_up_worker(pool);
725e8ec5 1119 }
a9b8a985 1120 raw_spin_unlock_irq(&pool->lock);
e22bee78
TH
1121}
1122
616db877
TH
1123/**
1124 * wq_worker_tick - a scheduler tick occurred while a kworker is running
1125 * @task: task currently running
1126 *
1127 * Called from scheduler_tick(). We're in the IRQ context and the current
1128 * worker's fields which follow the 'K' locking rule can be accessed safely.
1129 */
1130void wq_worker_tick(struct task_struct *task)
1131{
1132 struct worker *worker = kthread_data(task);
1133 struct pool_workqueue *pwq = worker->current_pwq;
1134 struct worker_pool *pool = worker->pool;
1135
1136 if (!pwq)
1137 return;
1138
1139 /*
1140 * If the current worker is concurrency managed and hogged the CPU for
1141 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
1142 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
1143 */
1144 if ((worker->flags & WORKER_NOT_RUNNING) ||
1145 worker->task->se.sum_exec_runtime - worker->current_at <
1146 wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
1147 return;
1148
1149 raw_spin_lock(&pool->lock);
1150
1151 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
63638450 1152 wq_cpu_intensive_report(worker->current_func);
616db877
TH
1153 pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
1154
1155 if (need_more_worker(pool)) {
1156 pwq->stats[PWQ_STAT_CM_WAKEUP]++;
1157 wake_up_worker(pool);
1158 }
1159
1160 raw_spin_unlock(&pool->lock);
1161}
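/*
 * Editor's note (illustrative): since the check above runs from
 * scheduler_tick(), a hogging work item is flagged on the first tick after it
 * has accumulated wq_cpu_intensive_thresh_us (10ms by default) of CPU time
 * without sleeping, i.e. within roughly one tick period of crossing the
 * threshold.
 */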
1162
1b69ac6b
JW
1163/**
1164 * wq_worker_last_func - retrieve worker's last work function
8194fe94 1165 * @task: Task to retrieve last work function of.
1b69ac6b
JW
1166 *
1167 * Determine the last function a worker executed. This is called from
1168 * the scheduler to get a worker's last known identity.
1169 *
1170 * CONTEXT:
a9b8a985 1171 * raw_spin_lock_irq(rq->lock)
1b69ac6b 1172 *
4b047002
JW
1173 * This function is called during schedule() when a kworker is going
1174 * to sleep. It's used by psi to identify aggregation workers during
1175 * dequeuing, to allow periodic aggregation to shut off when that
1176 * worker is the last task in the system or cgroup to go to sleep.
1177 *
1178 * As this function doesn't involve any workqueue-related locking, it
1179 * only returns stable values when called from inside the scheduler's
1180 * queuing and dequeuing paths, when @task, which must be a kworker,
1181 * is guaranteed to not be processing any works.
1182 *
1b69ac6b
JW
1183 * Return:
1184 * The last work function %current executed as a worker, NULL if it
1185 * hasn't executed any work yet.
1186 */
1187work_func_t wq_worker_last_func(struct task_struct *task)
1188{
1189 struct worker *worker = kthread_data(task);
1190
1191 return worker->last_func;
1192}
1193
8cca0eea
TH
1194/**
1195 * find_worker_executing_work - find worker which is executing a work
c9e7cf27 1196 * @pool: pool of interest
8cca0eea
TH
1197 * @work: work to find worker for
1198 *
c9e7cf27
TH
1199 * Find a worker which is executing @work on @pool by searching
1200 * @pool->busy_hash which is keyed by the address of @work. For a worker
a2c1c57b
TH
1201 * to match, its current execution should match the address of @work and
1202 * its work function. This is to avoid unwanted dependency between
1203 * unrelated work executions through a work item being recycled while still
1204 * being executed.
1205 *
1206 * This is a bit tricky. A work item may be freed once its execution
1207 * starts and nothing prevents the freed area from being recycled for
1208 * another work item. If the same work item address ends up being reused
1209 * before the original execution finishes, workqueue will identify the
1210 * recycled work item as currently executing and make it wait until the
1211 * current execution finishes, introducing an unwanted dependency.
1212 *
c5aa87bb
TH
1213 * This function checks the work item address and work function to avoid
1214 * false positives. Note that this isn't complete as one may construct a
1215 * work function which can introduce dependency onto itself through a
1216 * recycled work item. Well, if somebody wants to shoot oneself in the
1217 * foot that badly, there's only so much we can do, and if such deadlock
1218 * actually occurs, it should be easy to locate the culprit work function.
8cca0eea
TH
1219 *
1220 * CONTEXT:
a9b8a985 1221 * raw_spin_lock_irq(pool->lock).
8cca0eea 1222 *
d185af30
YB
1223 * Return:
1224 * Pointer to worker which is executing @work if found, %NULL
8cca0eea 1225 * otherwise.
4d707b9f 1226 */
c9e7cf27 1227static struct worker *find_worker_executing_work(struct worker_pool *pool,
8cca0eea 1228 struct work_struct *work)
4d707b9f 1229{
42f8570f 1230 struct worker *worker;
42f8570f 1231
b67bfe0d 1232 hash_for_each_possible(pool->busy_hash, worker, hentry,
a2c1c57b
TH
1233 (unsigned long)work)
1234 if (worker->current_work == work &&
1235 worker->current_func == work->func)
42f8570f
SL
1236 return worker;
1237
1238 return NULL;
4d707b9f
ON
1239}
1240
bf4ede01
TH
1241/**
1242 * move_linked_works - move linked works to a list
1243 * @work: start of series of works to be scheduled
1244 * @head: target list to append @work to
402dd89d 1245 * @nextp: out parameter for nested worklist walking
bf4ede01
TH
1246 *
1247 * Schedule linked works starting from @work to @head. Work series to
1248 * be scheduled starts at @work and includes any consecutive work with
1249 * WORK_STRUCT_LINKED set in its predecessor.
1250 *
1251 * If @nextp is not NULL, it's updated to point to the next work of
1252 * the last scheduled work. This allows move_linked_works() to be
1253 * nested inside outer list_for_each_entry_safe().
1254 *
1255 * CONTEXT:
a9b8a985 1256 * raw_spin_lock_irq(pool->lock).
bf4ede01
TH
1257 */
1258static void move_linked_works(struct work_struct *work, struct list_head *head,
1259 struct work_struct **nextp)
1260{
1261 struct work_struct *n;
1262
1263 /*
1264 * Linked worklist will always end before the end of the list,
1265 * use NULL for list head.
1266 */
1267 list_for_each_entry_safe_from(work, n, NULL, entry) {
1268 list_move_tail(&work->entry, head);
1269 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1270 break;
1271 }
1272
1273 /*
1274 * If we're already inside safe list traversal and have moved
1275 * multiple works to the scheduled queue, the next position
1276 * needs to be updated.
1277 */
1278 if (nextp)
1279 *nextp = n;
1280}
1281
8864b4e5
TH
1282/**
1283 * get_pwq - get an extra reference on the specified pool_workqueue
1284 * @pwq: pool_workqueue to get
1285 *
1286 * Obtain an extra reference on @pwq. The caller should guarantee that
1287 * @pwq has positive refcnt and be holding the matching pool->lock.
1288 */
1289static void get_pwq(struct pool_workqueue *pwq)
1290{
1291 lockdep_assert_held(&pwq->pool->lock);
1292 WARN_ON_ONCE(pwq->refcnt <= 0);
1293 pwq->refcnt++;
1294}
1295
1296/**
1297 * put_pwq - put a pool_workqueue reference
1298 * @pwq: pool_workqueue to put
1299 *
1300 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1301 * destruction. The caller should be holding the matching pool->lock.
1302 */
1303static void put_pwq(struct pool_workqueue *pwq)
1304{
1305 lockdep_assert_held(&pwq->pool->lock);
1306 if (likely(--pwq->refcnt))
1307 return;
1308 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1309 return;
1310 /*
1311 * @pwq can't be released under pool->lock, bounce to
1312 * pwq_unbound_release_workfn(). This never recurses on the same
1313 * pool->lock as this path is taken only for unbound workqueues and
1314 * the release work item is scheduled on a per-cpu workqueue. To
1315 * avoid lockdep warning, unbound pool->locks are given lockdep
1316 * subclass of 1 in get_unbound_pool().
1317 */
1318 schedule_work(&pwq->unbound_release_work);
1319}
1320
dce90d47
TH
1321/**
1322 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1323 * @pwq: pool_workqueue to put (can be %NULL)
1324 *
1325 * put_pwq() with locking. This function also allows %NULL @pwq.
1326 */
1327static void put_pwq_unlocked(struct pool_workqueue *pwq)
1328{
1329 if (pwq) {
1330 /*
24acfb71 1331 * As both pwqs and pools are RCU protected, the
dce90d47
TH
1332 * following lock operations are safe.
1333 */
a9b8a985 1334 raw_spin_lock_irq(&pwq->pool->lock);
dce90d47 1335 put_pwq(pwq);
a9b8a985 1336 raw_spin_unlock_irq(&pwq->pool->lock);
dce90d47
TH
1337 }
1338}
1339
f97a4a1a 1340static void pwq_activate_inactive_work(struct work_struct *work)
bf4ede01 1341{
112202d9 1342 struct pool_workqueue *pwq = get_work_pwq(work);
bf4ede01
TH
1343
1344 trace_workqueue_activate_work(work);
82607adc
TH
1345 if (list_empty(&pwq->pool->worklist))
1346 pwq->pool->watchdog_ts = jiffies;
112202d9 1347 move_linked_works(work, &pwq->pool->worklist, NULL);
f97a4a1a 1348 __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work));
112202d9 1349 pwq->nr_active++;
bf4ede01
TH
1350}
1351
f97a4a1a 1352static void pwq_activate_first_inactive(struct pool_workqueue *pwq)
3aa62497 1353{
f97a4a1a 1354 struct work_struct *work = list_first_entry(&pwq->inactive_works,
3aa62497
LJ
1355 struct work_struct, entry);
1356
f97a4a1a 1357 pwq_activate_inactive_work(work);
3aa62497
LJ
1358}
1359
bf4ede01 1360/**
112202d9
TH
1361 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1362 * @pwq: pwq of interest
c4560c2c 1363 * @work_data: work_data of work which left the queue
bf4ede01
TH
1364 *
1365 * A work either has completed or is removed from pending queue,
112202d9 1366 * decrement nr_in_flight of its pwq and handle workqueue flushing.
bf4ede01
TH
1367 *
1368 * CONTEXT:
a9b8a985 1369 * raw_spin_lock_irq(pool->lock).
bf4ede01 1370 */
c4560c2c 1371static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
bf4ede01 1372{
c4560c2c
LJ
1373 int color = get_work_color(work_data);
1374
018f3a13
LJ
1375 if (!(work_data & WORK_STRUCT_INACTIVE)) {
1376 pwq->nr_active--;
1377 if (!list_empty(&pwq->inactive_works)) {
1378 /* one down, submit an inactive one */
1379 if (pwq->nr_active < pwq->max_active)
1380 pwq_activate_first_inactive(pwq);
1381 }
1382 }
1383
112202d9 1384 pwq->nr_in_flight[color]--;
bf4ede01 1385
bf4ede01 1386 /* is flush in progress and are we at the flushing tip? */
112202d9 1387 if (likely(pwq->flush_color != color))
8864b4e5 1388 goto out_put;
bf4ede01
TH
1389
1390 /* are there still in-flight works? */
112202d9 1391 if (pwq->nr_in_flight[color])
8864b4e5 1392 goto out_put;
bf4ede01 1393
112202d9
TH
1394 /* this pwq is done, clear flush_color */
1395 pwq->flush_color = -1;
bf4ede01
TH
1396
1397 /*
112202d9 1398 * If this was the last pwq, wake up the first flusher. It
bf4ede01
TH
1399 * will handle the rest.
1400 */
112202d9
TH
1401 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1402 complete(&pwq->wq->first_flusher->done);
8864b4e5
TH
1403out_put:
1404 put_pwq(pwq);
bf4ede01
TH
1405}
1406
36e227d2 1407/**
bbb68dfa 1408 * try_to_grab_pending - steal work item from worklist and disable irq
36e227d2
TH
1409 * @work: work item to steal
1410 * @is_dwork: @work is a delayed_work
bbb68dfa 1411 * @flags: place to store irq state
36e227d2
TH
1412 *
1413 * Try to grab PENDING bit of @work. This function can handle @work in any
d185af30 1414 * stable state - idle, on timer or on worklist.
36e227d2 1415 *
d185af30 1416 * Return:
3eb6b31b
MCC
1417 *
1418 * ======== ================================================================
36e227d2
TH
1419 * 1 if @work was pending and we successfully stole PENDING
1420 * 0 if @work was idle and we claimed PENDING
1421 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
bbb68dfa
TH
1422 * -ENOENT if someone else is canceling @work, this state may persist
1423 * for arbitrarily long
3eb6b31b 1424 * ======== ================================================================
36e227d2 1425 *
d185af30 1426 * Note:
bbb68dfa 1427 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
e0aecdd8
TH
1428 * interrupted while holding PENDING and @work off queue, irq must be
1429 * disabled on entry. This, combined with delayed_work->timer being
1430 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
bbb68dfa
TH
1431 *
1432 * On successful return, >= 0, irq is disabled and the caller is
1433 * responsible for releasing it using local_irq_restore(*@flags).
1434 *
e0aecdd8 1435 * This function is safe to call from any context including IRQ handler.
bf4ede01 1436 */
bbb68dfa
TH
1437static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1438 unsigned long *flags)
bf4ede01 1439{
d565ed63 1440 struct worker_pool *pool;
112202d9 1441 struct pool_workqueue *pwq;
bf4ede01 1442
bbb68dfa
TH
1443 local_irq_save(*flags);
1444
36e227d2
TH
1445 /* try to steal the timer if it exists */
1446 if (is_dwork) {
1447 struct delayed_work *dwork = to_delayed_work(work);
1448
e0aecdd8
TH
1449 /*
1450 * dwork->timer is irqsafe. If del_timer() fails, it's
1451 * guaranteed that the timer is not queued anywhere and not
1452 * running on the local CPU.
1453 */
36e227d2
TH
1454 if (likely(del_timer(&dwork->timer)))
1455 return 1;
1456 }
1457
1458 /* try to claim PENDING the normal way */
bf4ede01
TH
1459 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1460 return 0;
1461
24acfb71 1462 rcu_read_lock();
bf4ede01
TH
1463 /*
1464 * The queueing is in progress, or it is already queued. Try to
1465 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1466 */
d565ed63
TH
1467 pool = get_work_pool(work);
1468 if (!pool)
bbb68dfa 1469 goto fail;
bf4ede01 1470
a9b8a985 1471 raw_spin_lock(&pool->lock);
0b3dae68 1472 /*
112202d9
TH
1473 * work->data is guaranteed to point to pwq only while the work
1474 * item is queued on pwq->wq, and both updating work->data to point
1475 * to pwq on queueing and to pool on dequeueing are done under
1476 * pwq->pool->lock. This in turn guarantees that, if work->data
1477 * points to pwq which is associated with a locked pool, the work
0b3dae68
LJ
1478 * item is currently queued on that pool.
1479 */
112202d9
TH
1480 pwq = get_work_pwq(work);
1481 if (pwq && pwq->pool == pool) {
16062836
TH
1482 debug_work_deactivate(work);
1483
1484 /*
018f3a13
LJ
1485 * A cancelable inactive work item must be in the
1486 * pwq->inactive_works since a queued barrier can't be
1487 * canceled (see the comments in insert_wq_barrier()).
1488 *
f97a4a1a 1489 * An inactive work item cannot be grabbed directly because
d812796e 1490 * it might have linked barrier work items which, if left
f97a4a1a 1491 * on the inactive_works list, will confuse pwq->nr_active
16062836
TH
1492 * management later on and cause stall. Make sure the work
1493 * item is activated before grabbing.
1494 */
f97a4a1a
LJ
1495 if (*work_data_bits(work) & WORK_STRUCT_INACTIVE)
1496 pwq_activate_inactive_work(work);
16062836
TH
1497
1498 list_del_init(&work->entry);
c4560c2c 1499 pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
16062836 1500
112202d9 1501 /* work->data points to pwq iff queued, point to pool */
16062836
TH
1502 set_work_pool_and_keep_pending(work, pool->id);
1503
a9b8a985 1504 raw_spin_unlock(&pool->lock);
24acfb71 1505 rcu_read_unlock();
16062836 1506 return 1;
bf4ede01 1507 }
a9b8a985 1508 raw_spin_unlock(&pool->lock);
bbb68dfa 1509fail:
24acfb71 1510 rcu_read_unlock();
bbb68dfa
TH
1511 local_irq_restore(*flags);
1512 if (work_is_canceling(work))
1513 return -ENOENT;
1514 cpu_relax();
36e227d2 1515 return -EAGAIN;
bf4ede01
TH
1516}
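/*
 * Illustrative sketch, not part of the original file: the canonical caller
 * pattern for try_to_grab_pending() retries -EAGAIN and treats any other
 * return value as a decision point, mirroring mod_delayed_work_on() further
 * below.  The wrapper name example_grab_pending() is hypothetical.
 */
static inline int example_grab_pending(struct delayed_work *dwork,
				       unsigned long *flags)
{
	int ret;

	do {
		/* on >= 0 return, PENDING is owned and irqs are disabled */
		ret = try_to_grab_pending(&dwork->work, true, flags);
	} while (unlikely(ret == -EAGAIN));

	return ret;
}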
1517
4690c4ab 1518/**
706026c2 1519 * insert_work - insert a work into a pool
112202d9 1520 * @pwq: pwq @work belongs to
4690c4ab
TH
1521 * @work: work to insert
1522 * @head: insertion point
1523 * @extra_flags: extra WORK_STRUCT_* flags to set
1524 *
112202d9 1525 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
706026c2 1526 * work_struct flags.
4690c4ab
TH
1527 *
1528 * CONTEXT:
a9b8a985 1529 * raw_spin_lock_irq(pool->lock).
4690c4ab 1530 */
112202d9
TH
1531static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1532 struct list_head *head, unsigned int extra_flags)
b89deed3 1533{
112202d9 1534 struct worker_pool *pool = pwq->pool;
e22bee78 1535
e89a85d6 1536 /* record the work call stack in order to print it in KASAN reports */
f70da745 1537 kasan_record_aux_stack_noalloc(work);
e89a85d6 1538
4690c4ab 1539 /* we own @work, set data and link */
112202d9 1540 set_work_pwq(work, pwq, extra_flags);
1a4d9b0a 1541 list_add_tail(&work->entry, head);
8864b4e5 1542 get_pwq(pwq);
e22bee78 1543
63d95a91
TH
1544 if (__need_more_worker(pool))
1545 wake_up_worker(pool);
b89deed3
ON
1546}
1547
c8efcc25
TH
1548/*
1549 * Test whether a work item is being queued from another work item
8d03ecfe 1550 * executing on the same workqueue @wq.
c8efcc25
TH
1551 */
1552static bool is_chained_work(struct workqueue_struct *wq)
1553{
8d03ecfe
TH
1554 struct worker *worker;
1555
1556 worker = current_wq_worker();
1557 /*
bf393fd4 1558 * Return %true iff I'm a worker executing a work item on @wq. If
8d03ecfe
TH
1559 * I'm @worker, it's safe to dereference it without locking.
1560 */
112202d9 1561 return worker && worker->current_pwq->wq == wq;
c8efcc25
TH
1562}
1563
ef557180
MG
1564/*
1565 * When queueing an unbound work item to a wq, prefer local CPU if allowed
1566 * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
1567 * avoid perturbing sensitive tasks.
1568 */
1569static int wq_select_unbound_cpu(int cpu)
1570{
1571 int new_cpu;
1572
f303fccb
TH
1573 if (likely(!wq_debug_force_rr_cpu)) {
1574 if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
1575 return cpu;
a8ec5880
AF
1576 } else {
1577 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
f303fccb
TH
1578 }
1579
ef557180
MG
1580 if (cpumask_empty(wq_unbound_cpumask))
1581 return cpu;
1582
1583 new_cpu = __this_cpu_read(wq_rr_cpu_last);
1584 new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
1585 if (unlikely(new_cpu >= nr_cpu_ids)) {
1586 new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
1587 if (unlikely(new_cpu >= nr_cpu_ids))
1588 return cpu;
1589 }
1590 __this_cpu_write(wq_rr_cpu_last, new_cpu);
1591
1592 return new_cpu;
1593}
1594
d84ff051 1595static void __queue_work(int cpu, struct workqueue_struct *wq,
1da177e4
LT
1596 struct work_struct *work)
1597{
112202d9 1598 struct pool_workqueue *pwq;
c9178087 1599 struct worker_pool *last_pool;
1e19ffc6 1600 struct list_head *worklist;
8a2e8e5d 1601 unsigned int work_flags;
b75cac93 1602 unsigned int req_cpu = cpu;
8930caba
TH
1603
1604 /*
1605 * While a work item is PENDING && off queue, a task trying to
1606 * steal the PENDING will busy-loop waiting for it to either get
1607 * queued or lose PENDING. Grabbing PENDING and queueing should
1608 * happen with IRQ disabled.
1609 */
8e8eb730 1610 lockdep_assert_irqs_disabled();
1da177e4 1611
1e19ffc6 1612
33e3f0a3
RC
1613 /*
1614 * For a draining wq, only works from the same workqueue are
1615 * allowed. The __WQ_DESTROYING flag helps to spot the issue of
1616 * queueing a new work item to a wq after destroy_workqueue(wq).
1617 */
1618 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
1619 WARN_ON_ONCE(!is_chained_work(wq))))
e41e704b 1620 return;
24acfb71 1621 rcu_read_lock();
9e8cd2f5 1622retry:
c9178087 1623 /* pwq which will be used unless @work is executing elsewhere */
aa202f1f
HD
1624 if (wq->flags & WQ_UNBOUND) {
1625 if (req_cpu == WORK_CPU_UNBOUND)
1626 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
df2d5ae4 1627 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
aa202f1f
HD
1628 } else {
1629 if (req_cpu == WORK_CPU_UNBOUND)
1630 cpu = raw_smp_processor_id();
1631 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1632 }
dbf2576e 1633
c9178087
TH
1634 /*
1635 * If @work was previously on a different pool, it might still be
1636 * running there, in which case the work needs to be queued on that
1637 * pool to guarantee non-reentrancy.
1638 */
1639 last_pool = get_work_pool(work);
1640 if (last_pool && last_pool != pwq->pool) {
1641 struct worker *worker;
18aa9eff 1642
a9b8a985 1643 raw_spin_lock(&last_pool->lock);
18aa9eff 1644
c9178087 1645 worker = find_worker_executing_work(last_pool, work);
18aa9eff 1646
c9178087
TH
1647 if (worker && worker->current_pwq->wq == wq) {
1648 pwq = worker->current_pwq;
8930caba 1649 } else {
c9178087 1650 /* meh... not running there, queue here */
a9b8a985
SAS
1651 raw_spin_unlock(&last_pool->lock);
1652 raw_spin_lock(&pwq->pool->lock);
8930caba 1653 }
f3421797 1654 } else {
a9b8a985 1655 raw_spin_lock(&pwq->pool->lock);
502ca9d8
TH
1656 }
1657
9e8cd2f5
TH
1658 /*
1659 * pwq is determined and locked. For unbound pools, we could have
1660 * raced with pwq release and it could already be dead. If its
1661 * refcnt is zero, repeat pwq selection. Note that pwqs never die
df2d5ae4
TH
1662 * without another pwq replacing it in the numa_pwq_tbl or while
1663 * work items are executing on it, so the retrying is guaranteed to
9e8cd2f5
TH
1664 * make forward-progress.
1665 */
1666 if (unlikely(!pwq->refcnt)) {
1667 if (wq->flags & WQ_UNBOUND) {
a9b8a985 1668 raw_spin_unlock(&pwq->pool->lock);
9e8cd2f5
TH
1669 cpu_relax();
1670 goto retry;
1671 }
1672 /* oops */
1673 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1674 wq->name, cpu);
1675 }
1676
112202d9
TH
1677 /* pwq determined, queue */
1678 trace_workqueue_queue_work(req_cpu, pwq, work);
502ca9d8 1679
24acfb71
TG
1680 if (WARN_ON(!list_empty(&work->entry)))
1681 goto out;
1e19ffc6 1682
112202d9
TH
1683 pwq->nr_in_flight[pwq->work_color]++;
1684 work_flags = work_color_to_flags(pwq->work_color);
1e19ffc6 1685
112202d9 1686 if (likely(pwq->nr_active < pwq->max_active)) {
cdadf009 1687 trace_workqueue_activate_work(work);
112202d9
TH
1688 pwq->nr_active++;
1689 worklist = &pwq->pool->worklist;
82607adc
TH
1690 if (list_empty(worklist))
1691 pwq->pool->watchdog_ts = jiffies;
8a2e8e5d 1692 } else {
f97a4a1a
LJ
1693 work_flags |= WORK_STRUCT_INACTIVE;
1694 worklist = &pwq->inactive_works;
8a2e8e5d 1695 }
1e19ffc6 1696
0687c66b 1697 debug_work_activate(work);
112202d9 1698 insert_work(pwq, work, worklist, work_flags);
1e19ffc6 1699
24acfb71 1700out:
a9b8a985 1701 raw_spin_unlock(&pwq->pool->lock);
24acfb71 1702 rcu_read_unlock();
1da177e4
LT
1703}
1704
0fcb78c2 1705/**
c1a220e7
ZR
1706 * queue_work_on - queue work on specific cpu
1707 * @cpu: CPU number to execute work on
0fcb78c2
REB
1708 * @wq: workqueue to use
1709 * @work: work to queue
1710 *
c1a220e7 1711 * We queue the work to a specific CPU; the caller must ensure it
443378f0
PM
 1712 * can't go away. If the caller fails to ensure that the specified
 1713 * CPU cannot go away, the work will execute on a randomly chosen CPU.
854f5cc5
PM
1714 * But note well that callers specifying a CPU that never has been
1715 * online will get a splat.
d185af30
YB
1716 *
1717 * Return: %false if @work was already on a queue, %true otherwise.
1da177e4 1718 */
d4283e93
TH
1719bool queue_work_on(int cpu, struct workqueue_struct *wq,
1720 struct work_struct *work)
1da177e4 1721{
d4283e93 1722 bool ret = false;
8930caba 1723 unsigned long flags;
ef1ca236 1724
8930caba 1725 local_irq_save(flags);
c1a220e7 1726
22df02bb 1727 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
4690c4ab 1728 __queue_work(cpu, wq, work);
d4283e93 1729 ret = true;
c1a220e7 1730 }
ef1ca236 1731
8930caba 1732 local_irq_restore(flags);
1da177e4
LT
1733 return ret;
1734}
ad7b1f84 1735EXPORT_SYMBOL(queue_work_on);
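/*
 * Illustrative usage sketch, not part of the original file: queueing a work
 * item on a specific CPU via the system workqueue.  my_work_fn, my_work and
 * my_kick_cpu1 are hypothetical names.
 */
static void my_work_fn(struct work_struct *work)
{
	pr_info("running on CPU %d\n", raw_smp_processor_id());
}

static DECLARE_WORK(my_work, my_work_fn);

static void my_kick_cpu1(void)
{
	/* caller must ensure CPU 1 stays online, e.g. under cpus_read_lock() */
	queue_work_on(1, system_wq, &my_work);
}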
1da177e4 1736
8204e0c1
AD
1737/**
1738 * workqueue_select_cpu_near - Select a CPU based on NUMA node
1739 * @node: NUMA node ID that we want to select a CPU from
1740 *
1741 * This function will attempt to find a "random" cpu available on a given
 1742 * node. If there are no CPUs available on the given node, it will return
 1743 * WORK_CPU_UNBOUND, indicating that we should just schedule the work to
 1744 * any available CPU.
1745 */
1746static int workqueue_select_cpu_near(int node)
1747{
1748 int cpu;
1749
1750 /* No point in doing this if NUMA isn't enabled for workqueues */
1751 if (!wq_numa_enabled)
1752 return WORK_CPU_UNBOUND;
1753
1754 /* Delay binding to CPU if node is not valid or online */
1755 if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
1756 return WORK_CPU_UNBOUND;
1757
1758 /* Use local node/cpu if we are already there */
1759 cpu = raw_smp_processor_id();
1760 if (node == cpu_to_node(cpu))
1761 return cpu;
1762
 1764 /* Use "random", otherwise known as "first", online CPU of node */
1764 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
1765
1766 /* If CPU is valid return that, otherwise just defer */
1767 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
1768}
1769
1770/**
1771 * queue_work_node - queue work on a "random" cpu for a given NUMA node
1772 * @node: NUMA node that we are targeting the work for
1773 * @wq: workqueue to use
1774 * @work: work to queue
1775 *
1776 * We queue the work to a "random" CPU within a given NUMA node. The basic
1777 * idea here is to provide a way to somehow associate work with a given
1778 * NUMA node.
1779 *
1780 * This function will only make a best effort attempt at getting this onto
1781 * the right NUMA node. If no node is requested or the requested node is
1782 * offline then we just fall back to standard queue_work behavior.
1783 *
1784 * Currently the "random" CPU ends up being the first available CPU in the
1785 * intersection of cpu_online_mask and the cpumask of the node, unless we
1786 * are running on the node. In that case we just use the current CPU.
1787 *
1788 * Return: %false if @work was already on a queue, %true otherwise.
1789 */
1790bool queue_work_node(int node, struct workqueue_struct *wq,
1791 struct work_struct *work)
1792{
1793 unsigned long flags;
1794 bool ret = false;
1795
1796 /*
1797 * This current implementation is specific to unbound workqueues.
1798 * Specifically we only return the first available CPU for a given
1799 * node instead of cycling through individual CPUs within the node.
1800 *
1801 * If this is used with a per-cpu workqueue then the logic in
1802 * workqueue_select_cpu_near would need to be updated to allow for
1803 * some round robin type logic.
1804 */
1805 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
1806
1807 local_irq_save(flags);
1808
1809 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1810 int cpu = workqueue_select_cpu_near(node);
1811
1812 __queue_work(cpu, wq, work);
1813 ret = true;
1814 }
1815
1816 local_irq_restore(flags);
1817 return ret;
1818}
1819EXPORT_SYMBOL_GPL(queue_work_node);
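/*
 * Illustrative usage sketch, not part of the original file: targeting a NUMA
 * node from an unbound workqueue.  The wq must be WQ_UNBOUND, as the
 * WARN_ON_ONCE() above enforces; the names below are hypothetical.
 */
static void my_node_work_fn(struct work_struct *work) { }
static DECLARE_WORK(my_node_work, my_node_work_fn);

static int my_queue_on_node(int node)
{
	struct workqueue_struct *wq;

	wq = alloc_workqueue("my_unbound_wq", WQ_UNBOUND, 0);
	if (!wq)
		return -ENOMEM;

	/* best effort: falls back to any online CPU if @node has none */
	queue_work_node(node, wq, &my_node_work);
	return 0;
}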
1820
8c20feb6 1821void delayed_work_timer_fn(struct timer_list *t)
1da177e4 1822{
8c20feb6 1823 struct delayed_work *dwork = from_timer(dwork, t, timer);
1da177e4 1824
e0aecdd8 1825 /* should have been called from irqsafe timer with irq already off */
60c057bc 1826 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1da177e4 1827}
1438ade5 1828EXPORT_SYMBOL(delayed_work_timer_fn);
1da177e4 1829
7beb2edf
TH
1830static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1831 struct delayed_work *dwork, unsigned long delay)
1da177e4 1832{
7beb2edf
TH
1833 struct timer_list *timer = &dwork->timer;
1834 struct work_struct *work = &dwork->work;
7beb2edf 1835
637fdbae 1836 WARN_ON_ONCE(!wq);
4b243563 1837 WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
fc4b514f
TH
1838 WARN_ON_ONCE(timer_pending(timer));
1839 WARN_ON_ONCE(!list_empty(&work->entry));
7beb2edf 1840
8852aac2
TH
1841 /*
1842 * If @delay is 0, queue @dwork->work immediately. This is for
1843 * both optimization and correctness. The earliest @timer can
 1844 * expire is on the closest next tick, and delayed_work users depend
 1845 * on there being no such delay when @delay is 0.
1846 */
1847 if (!delay) {
1848 __queue_work(cpu, wq, &dwork->work);
1849 return;
1850 }
1851
60c057bc 1852 dwork->wq = wq;
1265057f 1853 dwork->cpu = cpu;
7beb2edf
TH
1854 timer->expires = jiffies + delay;
1855
041bd12e
TH
1856 if (unlikely(cpu != WORK_CPU_UNBOUND))
1857 add_timer_on(timer, cpu);
1858 else
1859 add_timer(timer);
1da177e4
LT
1860}
1861
0fcb78c2
REB
1862/**
1863 * queue_delayed_work_on - queue work on specific CPU after delay
1864 * @cpu: CPU number to execute work on
1865 * @wq: workqueue to use
af9997e4 1866 * @dwork: work to queue
0fcb78c2
REB
1867 * @delay: number of jiffies to wait before queueing
1868 *
d185af30 1869 * Return: %false if @work was already on a queue, %true otherwise. If
715f1300
TH
1870 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1871 * execution.
0fcb78c2 1872 */
d4283e93
TH
1873bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1874 struct delayed_work *dwork, unsigned long delay)
7a6bc1cd 1875{
52bad64d 1876 struct work_struct *work = &dwork->work;
d4283e93 1877 bool ret = false;
8930caba 1878 unsigned long flags;
7a6bc1cd 1879
8930caba
TH
1880 /* read the comment in __queue_work() */
1881 local_irq_save(flags);
7a6bc1cd 1882
22df02bb 1883 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
7beb2edf 1884 __queue_delayed_work(cpu, wq, dwork, delay);
d4283e93 1885 ret = true;
7a6bc1cd 1886 }
8a3e77cc 1887
8930caba 1888 local_irq_restore(flags);
7a6bc1cd
VP
1889 return ret;
1890}
ad7b1f84 1891EXPORT_SYMBOL(queue_delayed_work_on);
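/*
 * Illustrative usage sketch, not part of the original file: arming a delayed
 * work item to run roughly 100ms from now.  Names are hypothetical;
 * WORK_CPU_UNBOUND lets __queue_work() pick an appropriate CPU.
 */
static void my_dwork_fn(struct work_struct *work) { }
static DECLARE_DELAYED_WORK(my_dwork, my_dwork_fn);

static void my_arm_delayed(void)
{
	queue_delayed_work_on(WORK_CPU_UNBOUND, system_wq, &my_dwork,
			      msecs_to_jiffies(100));
}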
c7fc77f7 1892
8376fe22
TH
1893/**
1894 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1895 * @cpu: CPU number to execute work on
1896 * @wq: workqueue to use
1897 * @dwork: work to queue
1898 * @delay: number of jiffies to wait before queueing
1899 *
1900 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
1901 * modify @dwork's timer so that it expires after @delay. If @delay is
1902 * zero, @work is guaranteed to be scheduled immediately regardless of its
1903 * current state.
1904 *
d185af30 1905 * Return: %false if @dwork was idle and queued, %true if @dwork was
8376fe22
TH
1906 * pending and its timer was modified.
1907 *
e0aecdd8 1908 * This function is safe to call from any context including IRQ handler.
8376fe22
TH
1909 * See try_to_grab_pending() for details.
1910 */
1911bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1912 struct delayed_work *dwork, unsigned long delay)
1913{
1914 unsigned long flags;
1915 int ret;
c7fc77f7 1916
8376fe22
TH
1917 do {
1918 ret = try_to_grab_pending(&dwork->work, true, &flags);
1919 } while (unlikely(ret == -EAGAIN));
63bc0362 1920
8376fe22
TH
1921 if (likely(ret >= 0)) {
1922 __queue_delayed_work(cpu, wq, dwork, delay);
1923 local_irq_restore(flags);
7a6bc1cd 1924 }
8376fe22
TH
1925
1926 /* -ENOENT from try_to_grab_pending() becomes %true */
7a6bc1cd
VP
1927 return ret;
1928}
8376fe22
TH
1929EXPORT_SYMBOL_GPL(mod_delayed_work_on);
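/*
 * Illustrative usage sketch, not part of the original file: pushing back an
 * already-pending timeout, e.g. a watchdog that should fire only after a
 * period of inactivity.  my_dog and my_pet_watchdog are hypothetical names.
 */
static void my_dog_fn(struct work_struct *work) { }
static DECLARE_DELAYED_WORK(my_dog, my_dog_fn);

static void my_pet_watchdog(void)
{
	/* re-arms the timer whether or not my_dog was already pending */
	mod_delayed_work_on(WORK_CPU_UNBOUND, system_wq, &my_dog,
			    msecs_to_jiffies(500));
}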
1930
05f0fe6b
TH
1931static void rcu_work_rcufn(struct rcu_head *rcu)
1932{
1933 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
1934
1935 /* read the comment in __queue_work() */
1936 local_irq_disable();
1937 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
1938 local_irq_enable();
1939}
1940
1941/**
1942 * queue_rcu_work - queue work after a RCU grace period
1943 * @wq: workqueue to use
1944 * @rwork: work to queue
1945 *
1946 * Return: %false if @rwork was already pending, %true otherwise. Note
1947 * that a full RCU grace period is guaranteed only after a %true return.
bf393fd4 1948 * While @rwork is guaranteed to be executed after a %false return, the
05f0fe6b
TH
1949 * execution may happen before a full RCU grace period has passed.
1950 */
1951bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
1952{
1953 struct work_struct *work = &rwork->work;
1954
1955 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
1956 rwork->wq = wq;
a7e30c0e 1957 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
05f0fe6b
TH
1958 return true;
1959 }
1960
1961 return false;
1962}
1963EXPORT_SYMBOL(queue_rcu_work);
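/*
 * Illustrative usage sketch, not part of the original file: deferring work
 * until after an RCU grace period, e.g. to tear down an object that readers
 * may still be traversing.  Names are hypothetical.
 */
static void my_rcu_work_fn(struct work_struct *work)
{
	/* all pre-existing RCU readers have finished by the time this runs */
}

static struct rcu_work my_rwork;

static void my_defer_past_gp(void)
{
	INIT_RCU_WORK(&my_rwork, my_rcu_work_fn);
	queue_rcu_work(system_wq, &my_rwork);
}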
1964
c8e55f36
TH
1965/**
1966 * worker_enter_idle - enter idle state
1967 * @worker: worker which is entering idle state
1968 *
1969 * @worker is entering idle state. Update stats and idle timer if
1970 * necessary.
1971 *
1972 * LOCKING:
a9b8a985 1973 * raw_spin_lock_irq(pool->lock).
c8e55f36
TH
1974 */
1975static void worker_enter_idle(struct worker *worker)
1da177e4 1976{
bd7bdd43 1977 struct worker_pool *pool = worker->pool;
c8e55f36 1978
6183c009
TH
1979 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1980 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1981 (worker->hentry.next || worker->hentry.pprev)))
1982 return;
c8e55f36 1983
051e1850 1984 /* can't use worker_set_flags(), also called from create_worker() */
cb444766 1985 worker->flags |= WORKER_IDLE;
bd7bdd43 1986 pool->nr_idle++;
e22bee78 1987 worker->last_active = jiffies;
c8e55f36
TH
1988
1989 /* idle_list is LIFO */
bd7bdd43 1990 list_add(&worker->entry, &pool->idle_list);
db7bccf4 1991
628c78e7
TH
1992 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1993 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
cb444766 1994
989442d7 1995 /* Sanity check nr_running. */
bc35f7ef 1996 WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
c8e55f36
TH
1997}
1998
1999/**
2000 * worker_leave_idle - leave idle state
2001 * @worker: worker which is leaving idle state
2002 *
2003 * @worker is leaving idle state. Update stats.
2004 *
2005 * LOCKING:
a9b8a985 2006 * raw_spin_lock_irq(pool->lock).
c8e55f36
TH
2007 */
2008static void worker_leave_idle(struct worker *worker)
2009{
bd7bdd43 2010 struct worker_pool *pool = worker->pool;
c8e55f36 2011
6183c009
TH
2012 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
2013 return;
d302f017 2014 worker_clr_flags(worker, WORKER_IDLE);
bd7bdd43 2015 pool->nr_idle--;
c8e55f36
TH
2016 list_del_init(&worker->entry);
2017}
2018
f7537df5 2019static struct worker *alloc_worker(int node)
c34056a3
TH
2020{
2021 struct worker *worker;
2022
f7537df5 2023 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
c8e55f36
TH
2024 if (worker) {
2025 INIT_LIST_HEAD(&worker->entry);
affee4b2 2026 INIT_LIST_HEAD(&worker->scheduled);
da028469 2027 INIT_LIST_HEAD(&worker->node);
e22bee78
TH
2028 /* on creation a worker is in !idle && prep state */
2029 worker->flags = WORKER_PREP;
c8e55f36 2030 }
c34056a3
TH
2031 return worker;
2032}
2033
4736cbf7
LJ
2034/**
2035 * worker_attach_to_pool() - attach a worker to a pool
2036 * @worker: worker to be attached
2037 * @pool: the target pool
2038 *
2039 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
2040 * cpu-binding of @worker are kept coordinated with the pool across
2041 * cpu-[un]hotplugs.
2042 */
2043static void worker_attach_to_pool(struct worker *worker,
2044 struct worker_pool *pool)
2045{
1258fae7 2046 mutex_lock(&wq_pool_attach_mutex);
4736cbf7 2047
4736cbf7 2048 /*
1258fae7
TH
2049 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
2050 * stable across this function. See the comments above the flag
2051 * definition for details.
4736cbf7
LJ
2052 */
2053 if (pool->flags & POOL_DISASSOCIATED)
2054 worker->flags |= WORKER_UNBOUND;
5c25b5ff
PZ
2055 else
2056 kthread_set_per_cpu(worker->task, pool->cpu);
4736cbf7 2057
640f17c8
PZ
2058 if (worker->rescue_wq)
2059 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
2060
4736cbf7 2061 list_add_tail(&worker->node, &pool->workers);
a2d812a2 2062 worker->pool = pool;
4736cbf7 2063
1258fae7 2064 mutex_unlock(&wq_pool_attach_mutex);
4736cbf7
LJ
2065}
2066
60f5a4bc
LJ
2067/**
2068 * worker_detach_from_pool() - detach a worker from its pool
2069 * @worker: worker which is attached to its pool
60f5a4bc 2070 *
4736cbf7
LJ
 2071 * Undo the attaching which had been done in worker_attach_to_pool(). The
 2072 * caller worker shouldn't access the pool after being detached unless it
 2073 * holds another reference to the pool.
60f5a4bc 2074 */
a2d812a2 2075static void worker_detach_from_pool(struct worker *worker)
60f5a4bc 2076{
a2d812a2 2077 struct worker_pool *pool = worker->pool;
60f5a4bc
LJ
2078 struct completion *detach_completion = NULL;
2079
1258fae7 2080 mutex_lock(&wq_pool_attach_mutex);
a2d812a2 2081
5c25b5ff 2082 kthread_set_per_cpu(worker->task, -1);
da028469 2083 list_del(&worker->node);
a2d812a2
TH
2084 worker->pool = NULL;
2085
e02b9312 2086 if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
60f5a4bc 2087 detach_completion = pool->detach_completion;
1258fae7 2088 mutex_unlock(&wq_pool_attach_mutex);
60f5a4bc 2089
b62c0751
LJ
2090 /* clear leftover flags without pool->lock after it is detached */
2091 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
2092
60f5a4bc
LJ
2093 if (detach_completion)
2094 complete(detach_completion);
2095}
2096
c34056a3
TH
2097/**
2098 * create_worker - create a new workqueue worker
63d95a91 2099 * @pool: pool the new worker will belong to
c34056a3 2100 *
051e1850 2101 * Create and start a new worker which is attached to @pool.
c34056a3
TH
2102 *
2103 * CONTEXT:
2104 * Might sleep. Does GFP_KERNEL allocations.
2105 *
d185af30 2106 * Return:
c34056a3
TH
2107 * Pointer to the newly created worker.
2108 */
bc2ae0f5 2109static struct worker *create_worker(struct worker_pool *pool)
c34056a3 2110{
e441b56f
ZL
2111 struct worker *worker;
2112 int id;
e3c916a4 2113 char id_buf[16];
c34056a3 2114
7cda9aae 2115 /* ID is needed to determine kthread name */
e441b56f 2116 id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
3f0ea0b8
PM
2117 if (id < 0) {
2118 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
2119 ERR_PTR(id));
e441b56f 2120 return NULL;
3f0ea0b8 2121 }
c34056a3 2122
f7537df5 2123 worker = alloc_worker(pool->node);
3f0ea0b8
PM
2124 if (!worker) {
2125 pr_err_once("workqueue: Failed to allocate a worker\n");
c34056a3 2126 goto fail;
3f0ea0b8 2127 }
c34056a3 2128
c34056a3
TH
2129 worker->id = id;
2130
29c91e99 2131 if (pool->cpu >= 0)
e3c916a4
TH
2132 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
2133 pool->attrs->nice < 0 ? "H" : "");
f3421797 2134 else
e3c916a4
TH
2135 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
2136
f3f90ad4 2137 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
e3c916a4 2138 "kworker/%s", id_buf);
3f0ea0b8 2139 if (IS_ERR(worker->task)) {
60f54038
PM
2140 if (PTR_ERR(worker->task) == -EINTR) {
2141 pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
2142 id_buf);
2143 } else {
2144 pr_err_once("workqueue: Failed to create a worker thread: %pe",
2145 worker->task);
2146 }
c34056a3 2147 goto fail;
3f0ea0b8 2148 }
c34056a3 2149
91151228 2150 set_user_nice(worker->task, pool->attrs->nice);
25834c73 2151 kthread_bind_mask(worker->task, pool->attrs->cpumask);
91151228 2152
da028469 2153 /* successful, attach the worker to the pool */
4736cbf7 2154 worker_attach_to_pool(worker, pool);
822d8405 2155
051e1850 2156 /* start the newly created worker */
a9b8a985 2157 raw_spin_lock_irq(&pool->lock);
051e1850
LJ
2158 worker->pool->nr_workers++;
2159 worker_enter_idle(worker);
2160 wake_up_process(worker->task);
a9b8a985 2161 raw_spin_unlock_irq(&pool->lock);
051e1850 2162
c34056a3 2163 return worker;
822d8405 2164
c34056a3 2165fail:
e441b56f 2166 ida_free(&pool->worker_ida, id);
c34056a3
TH
2167 kfree(worker);
2168 return NULL;
2169}
2170
793777bc
VS
2171static void unbind_worker(struct worker *worker)
2172{
2173 lockdep_assert_held(&wq_pool_attach_mutex);
2174
2175 kthread_set_per_cpu(worker->task, -1);
2176 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
2177 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
2178 else
2179 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
2180}
2181
e02b9312
VS
2182static void wake_dying_workers(struct list_head *cull_list)
2183{
2184 struct worker *worker, *tmp;
2185
2186 list_for_each_entry_safe(worker, tmp, cull_list, entry) {
2187 list_del_init(&worker->entry);
2188 unbind_worker(worker);
2189 /*
2190 * If the worker was somehow already running, then it had to be
2191 * in pool->idle_list when set_worker_dying() happened or we
2192 * wouldn't have gotten here.
2193 *
2194 * Thus, the worker must either have observed the WORKER_DIE
2195 * flag, or have set its state to TASK_IDLE. Either way, the
2196 * below will be observed by the worker and is safe to do
2197 * outside of pool->lock.
2198 */
2199 wake_up_process(worker->task);
2200 }
2201}
2202
c34056a3 2203/**
e02b9312 2204 * set_worker_dying - Tag a worker for destruction
c34056a3 2205 * @worker: worker to be destroyed
e02b9312 2206 * @list: transfer worker away from its pool->idle_list and into list
c34056a3 2207 *
e02b9312
VS
2208 * Tag @worker for destruction and adjust @pool stats accordingly. The worker
2209 * should be idle.
c8e55f36
TH
2210 *
2211 * CONTEXT:
a9b8a985 2212 * raw_spin_lock_irq(pool->lock).
c34056a3 2213 */
e02b9312 2214static void set_worker_dying(struct worker *worker, struct list_head *list)
c34056a3 2215{
bd7bdd43 2216 struct worker_pool *pool = worker->pool;
c34056a3 2217
cd549687 2218 lockdep_assert_held(&pool->lock);
e02b9312 2219 lockdep_assert_held(&wq_pool_attach_mutex);
cd549687 2220
c34056a3 2221 /* sanity check frenzy */
6183c009 2222 if (WARN_ON(worker->current_work) ||
73eb7fe7
LJ
2223 WARN_ON(!list_empty(&worker->scheduled)) ||
2224 WARN_ON(!(worker->flags & WORKER_IDLE)))
6183c009 2225 return;
c34056a3 2226
73eb7fe7
LJ
2227 pool->nr_workers--;
2228 pool->nr_idle--;
5bdfff96 2229
cb444766 2230 worker->flags |= WORKER_DIE;
e02b9312
VS
2231
2232 list_move(&worker->entry, list);
2233 list_move(&worker->node, &pool->dying_workers);
c34056a3
TH
2234}
2235
3f959aa3
VS
2236/**
2237 * idle_worker_timeout - check if some idle workers can now be deleted.
2238 * @t: The pool's idle_timer that just expired
2239 *
2240 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
2241 * worker_leave_idle(), as a worker flicking between idle and active while its
2242 * pool is at the too_many_workers() tipping point would cause too much timer
2243 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
2244 * it expire and re-evaluate things from there.
2245 */
32a6c723 2246static void idle_worker_timeout(struct timer_list *t)
e22bee78 2247{
32a6c723 2248 struct worker_pool *pool = from_timer(pool, t, idle_timer);
3f959aa3
VS
2249 bool do_cull = false;
2250
2251 if (work_pending(&pool->idle_cull_work))
2252 return;
e22bee78 2253
a9b8a985 2254 raw_spin_lock_irq(&pool->lock);
e22bee78 2255
3f959aa3 2256 if (too_many_workers(pool)) {
e22bee78
TH
2257 struct worker *worker;
2258 unsigned long expires;
2259
2260 /* idle_list is kept in LIFO order, check the last one */
3f959aa3
VS
2261 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2262 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2263 do_cull = !time_before(jiffies, expires);
2264
2265 if (!do_cull)
2266 mod_timer(&pool->idle_timer, expires);
2267 }
2268 raw_spin_unlock_irq(&pool->lock);
2269
2270 if (do_cull)
2271 queue_work(system_unbound_wq, &pool->idle_cull_work);
2272}
2273
2274/**
2275 * idle_cull_fn - cull workers that have been idle for too long.
2276 * @work: the pool's work for handling these idle workers
2277 *
2278 * This goes through a pool's idle workers and gets rid of those that have been
2279 * idle for at least IDLE_WORKER_TIMEOUT seconds.
e02b9312
VS
2280 *
2281 * We don't want to disturb isolated CPUs because of a pcpu kworker being
2282 * culled, so this also resets worker affinity. This requires a sleepable
2283 * context, hence the split between timer callback and work item.
3f959aa3
VS
2284 */
2285static void idle_cull_fn(struct work_struct *work)
2286{
2287 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
e02b9312 2288 struct list_head cull_list;
3f959aa3 2289
e02b9312
VS
2290 INIT_LIST_HEAD(&cull_list);
2291 /*
2292 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
 2293 * cannot proceed beyond worker_detach_from_pool() in its self-destruct
2294 * path. This is required as a previously-preempted worker could run after
2295 * set_worker_dying() has happened but before wake_dying_workers() did.
2296 */
2297 mutex_lock(&wq_pool_attach_mutex);
3f959aa3
VS
2298 raw_spin_lock_irq(&pool->lock);
2299
2300 while (too_many_workers(pool)) {
2301 struct worker *worker;
2302 unsigned long expires;
2303
63d95a91 2304 worker = list_entry(pool->idle_list.prev, struct worker, entry);
e22bee78
TH
2305 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2306
3347fc9f 2307 if (time_before(jiffies, expires)) {
63d95a91 2308 mod_timer(&pool->idle_timer, expires);
3347fc9f 2309 break;
d5abe669 2310 }
3347fc9f 2311
e02b9312 2312 set_worker_dying(worker, &cull_list);
e22bee78
TH
2313 }
2314
a9b8a985 2315 raw_spin_unlock_irq(&pool->lock);
e02b9312
VS
2316 wake_dying_workers(&cull_list);
2317 mutex_unlock(&wq_pool_attach_mutex);
e22bee78 2318}
d5abe669 2319
493a1724 2320static void send_mayday(struct work_struct *work)
e22bee78 2321{
112202d9
TH
2322 struct pool_workqueue *pwq = get_work_pwq(work);
2323 struct workqueue_struct *wq = pwq->wq;
493a1724 2324
2e109a28 2325 lockdep_assert_held(&wq_mayday_lock);
e22bee78 2326
493008a8 2327 if (!wq->rescuer)
493a1724 2328 return;
e22bee78
TH
2329
2330 /* mayday mayday mayday */
493a1724 2331 if (list_empty(&pwq->mayday_node)) {
77668c8b
LJ
2332 /*
2333 * If @pwq is for an unbound wq, its base ref may be put at
2334 * any time due to an attribute change. Pin @pwq until the
2335 * rescuer is done with it.
2336 */
2337 get_pwq(pwq);
493a1724 2338 list_add_tail(&pwq->mayday_node, &wq->maydays);
e22bee78 2339 wake_up_process(wq->rescuer->task);
725e8ec5 2340 pwq->stats[PWQ_STAT_MAYDAY]++;
493a1724 2341 }
e22bee78
TH
2342}
2343
32a6c723 2344static void pool_mayday_timeout(struct timer_list *t)
e22bee78 2345{
32a6c723 2346 struct worker_pool *pool = from_timer(pool, t, mayday_timer);
e22bee78
TH
2347 struct work_struct *work;
2348
a9b8a985
SAS
2349 raw_spin_lock_irq(&pool->lock);
2350 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
e22bee78 2351
63d95a91 2352 if (need_to_create_worker(pool)) {
e22bee78
TH
2353 /*
2354 * We've been trying to create a new worker but
2355 * haven't been successful. We might be hitting an
2356 * allocation deadlock. Send distress signals to
2357 * rescuers.
2358 */
63d95a91 2359 list_for_each_entry(work, &pool->worklist, entry)
e22bee78 2360 send_mayday(work);
1da177e4 2361 }
e22bee78 2362
a9b8a985
SAS
2363 raw_spin_unlock(&wq_mayday_lock);
2364 raw_spin_unlock_irq(&pool->lock);
e22bee78 2365
63d95a91 2366 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1da177e4
LT
2367}
2368
e22bee78
TH
2369/**
2370 * maybe_create_worker - create a new worker if necessary
63d95a91 2371 * @pool: pool to create a new worker for
e22bee78 2372 *
63d95a91 2373 * Create a new worker for @pool if necessary. @pool is guaranteed to
e22bee78
TH
2374 * have at least one idle worker on return from this function. If
2375 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
63d95a91 2376 * sent to all rescuers with works scheduled on @pool to resolve
e22bee78
TH
2377 * possible allocation deadlock.
2378 *
c5aa87bb
TH
2379 * On return, need_to_create_worker() is guaranteed to be %false and
2380 * may_start_working() %true.
e22bee78
TH
2381 *
2382 * LOCKING:
a9b8a985 2383 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
2384 * multiple times. Does GFP_KERNEL allocations. Called only from
2385 * manager.
e22bee78 2386 */
29187a9e 2387static void maybe_create_worker(struct worker_pool *pool)
d565ed63
TH
2388__releases(&pool->lock)
2389__acquires(&pool->lock)
1da177e4 2390{
e22bee78 2391restart:
a9b8a985 2392 raw_spin_unlock_irq(&pool->lock);
9f9c2364 2393
e22bee78 2394 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
63d95a91 2395 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
e22bee78
TH
2396
2397 while (true) {
051e1850 2398 if (create_worker(pool) || !need_to_create_worker(pool))
e22bee78 2399 break;
1da177e4 2400
e212f361 2401 schedule_timeout_interruptible(CREATE_COOLDOWN);
9f9c2364 2402
63d95a91 2403 if (!need_to_create_worker(pool))
e22bee78
TH
2404 break;
2405 }
2406
63d95a91 2407 del_timer_sync(&pool->mayday_timer);
a9b8a985 2408 raw_spin_lock_irq(&pool->lock);
051e1850
LJ
2409 /*
2410 * This is necessary even after a new worker was just successfully
2411 * created as @pool->lock was dropped and the new worker might have
2412 * already become busy.
2413 */
63d95a91 2414 if (need_to_create_worker(pool))
e22bee78 2415 goto restart;
e22bee78
TH
2416}
2417
73f53c4a 2418/**
e22bee78
TH
2419 * manage_workers - manage worker pool
2420 * @worker: self
73f53c4a 2421 *
706026c2 2422 * Assume the manager role and manage the worker pool @worker belongs
e22bee78 2423 * to. At any given time, there can be only zero or one manager per
706026c2 2424 * pool. The exclusion is handled automatically by this function.
e22bee78
TH
2425 *
2426 * The caller can safely start processing works on false return. On
2427 * true return, it's guaranteed that need_to_create_worker() is false
2428 * and may_start_working() is true.
73f53c4a
TH
2429 *
2430 * CONTEXT:
a9b8a985 2431 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
2432 * multiple times. Does GFP_KERNEL allocations.
2433 *
d185af30 2434 * Return:
29187a9e
TH
2435 * %false if the pool doesn't need management and the caller can safely
2436 * start processing works, %true if management function was performed and
2437 * the conditions that the caller verified before calling the function may
2438 * no longer be true.
73f53c4a 2439 */
e22bee78 2440static bool manage_workers(struct worker *worker)
73f53c4a 2441{
63d95a91 2442 struct worker_pool *pool = worker->pool;
73f53c4a 2443
692b4825 2444 if (pool->flags & POOL_MANAGER_ACTIVE)
29187a9e 2445 return false;
692b4825
TH
2446
2447 pool->flags |= POOL_MANAGER_ACTIVE;
2607d7a6 2448 pool->manager = worker;
1e19ffc6 2449
29187a9e 2450 maybe_create_worker(pool);
e22bee78 2451
2607d7a6 2452 pool->manager = NULL;
692b4825 2453 pool->flags &= ~POOL_MANAGER_ACTIVE;
d8bb65ab 2454 rcuwait_wake_up(&manager_wait);
29187a9e 2455 return true;
73f53c4a
TH
2456}
2457
a62428c0
TH
2458/**
2459 * process_one_work - process single work
c34056a3 2460 * @worker: self
a62428c0
TH
2461 * @work: work to process
2462 *
 2463 * Process @work. This function contains all the logic necessary to
 2464 * process a single work item, including synchronization against and
 2465 * interaction with other workers on the same cpu, queueing and
 2466 * flushing. As long as the context requirement is met, any worker can
 2467 * call this function to process a work item.
2468 *
2469 * CONTEXT:
a9b8a985 2470 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
a62428c0 2471 */
c34056a3 2472static void process_one_work(struct worker *worker, struct work_struct *work)
d565ed63
TH
2473__releases(&pool->lock)
2474__acquires(&pool->lock)
a62428c0 2475{
112202d9 2476 struct pool_workqueue *pwq = get_work_pwq(work);
bd7bdd43 2477 struct worker_pool *pool = worker->pool;
c4560c2c 2478 unsigned long work_data;
7e11629d 2479 struct worker *collision;
a62428c0
TH
2480#ifdef CONFIG_LOCKDEP
2481 /*
2482 * It is permissible to free the struct work_struct from
2483 * inside the function that is called from it, this we need to
2484 * take into account for lockdep too. To avoid bogus "held
2485 * lock freed" warnings as well as problems when looking into
2486 * work->lockdep_map, make a copy and use that here.
2487 */
4d82a1de
PZ
2488 struct lockdep_map lockdep_map;
2489
2490 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
a62428c0 2491#endif
807407c0 2492 /* ensure we're on the correct CPU */
85327af6 2493 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
ec22ca5e 2494 raw_smp_processor_id() != pool->cpu);
25511a47 2495
7e11629d
TH
2496 /*
2497 * A single work shouldn't be executed concurrently by
2498 * multiple workers on a single cpu. Check whether anyone is
2499 * already processing the work. If so, defer the work to the
2500 * currently executing one.
2501 */
c9e7cf27 2502 collision = find_worker_executing_work(pool, work);
7e11629d
TH
2503 if (unlikely(collision)) {
2504 move_linked_works(work, &collision->scheduled, NULL);
2505 return;
2506 }
2507
8930caba 2508 /* claim and dequeue */
a62428c0 2509 debug_work_deactivate(work);
c9e7cf27 2510 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
c34056a3 2511 worker->current_work = work;
a2c1c57b 2512 worker->current_func = work->func;
112202d9 2513 worker->current_pwq = pwq;
616db877 2514 worker->current_at = worker->task->se.sum_exec_runtime;
c4560c2c 2515 work_data = *work_data_bits(work);
d812796e 2516 worker->current_color = get_work_color(work_data);
7a22ad75 2517
8bf89593
TH
2518 /*
2519 * Record wq name for cmdline and debug reporting, may get
2520 * overridden through set_worker_desc().
2521 */
2522 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
2523
a62428c0
TH
2524 list_del_init(&work->entry);
2525
fb0e7beb 2526 /*
228f1d00
LJ
2527 * CPU intensive works don't participate in concurrency management.
2528 * They're the scheduler's responsibility. This takes @worker out
2529 * of concurrency management and the next code block will chain
2530 * execution of the pending work items.
fb0e7beb 2531 */
616db877 2532 if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
228f1d00 2533 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
fb0e7beb 2534
974271c4 2535 /*
a489a03e
LJ
2536 * Wake up another worker if necessary. The condition is always
2537 * false for normal per-cpu workers since nr_running would always
2538 * be >= 1 at this point. This is used to chain execution of the
2539 * pending work items for WORKER_NOT_RUNNING workers such as the
228f1d00 2540 * UNBOUND and CPU_INTENSIVE ones.
974271c4 2541 */
a489a03e 2542 if (need_more_worker(pool))
63d95a91 2543 wake_up_worker(pool);
974271c4 2544
8930caba 2545 /*
7c3eed5c 2546 * Record the last pool and clear PENDING which should be the last
d565ed63 2547 * update to @work. Also, do this inside @pool->lock so that
23657bb1
TH
2548 * PENDING and queued state changes happen together while IRQ is
2549 * disabled.
8930caba 2550 */
7c3eed5c 2551 set_work_pool_and_clear_pending(work, pool->id);
a62428c0 2552
a9b8a985 2553 raw_spin_unlock_irq(&pool->lock);
a62428c0 2554
a1d14934 2555 lock_map_acquire(&pwq->wq->lockdep_map);
a62428c0 2556 lock_map_acquire(&lockdep_map);
e6f3faa7 2557 /*
f52be570
PZ
2558 * Strictly speaking we should mark the invariant state without holding
2559 * any locks, that is, before these two lock_map_acquire()'s.
e6f3faa7
PZ
2560 *
2561 * However, that would result in:
2562 *
2563 * A(W1)
2564 * WFC(C)
2565 * A(W1)
2566 * C(C)
2567 *
2568 * Which would create W1->C->W1 dependencies, even though there is no
2569 * actual deadlock possible. There are two solutions, using a
2570 * read-recursive acquire on the work(queue) 'locks', but this will then
f52be570 2571 * hit the lockdep limitation on recursive locks, or simply discard
e6f3faa7
PZ
2572 * these locks.
2573 *
2574 * AFAICT there is no possible deadlock scenario between the
2575 * flush_work() and complete() primitives (except for single-threaded
2576 * workqueues), so hiding them isn't a problem.
2577 */
f52be570 2578 lockdep_invariant_state(true);
725e8ec5 2579 pwq->stats[PWQ_STAT_STARTED]++;
e36c886a 2580 trace_workqueue_execute_start(work);
a2c1c57b 2581 worker->current_func(work);
e36c886a
AV
2582 /*
2583 * While we must be careful to not use "work" after this, the trace
2584 * point will only record its address.
2585 */
1c5da0ec 2586 trace_workqueue_execute_end(work, worker->current_func);
725e8ec5 2587 pwq->stats[PWQ_STAT_COMPLETED]++;
a62428c0 2588 lock_map_release(&lockdep_map);
112202d9 2589 lock_map_release(&pwq->wq->lockdep_map);
a62428c0
TH
2590
2591 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
044c782c 2592 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
d75f773c 2593 " last function: %ps\n",
a2c1c57b
TH
2594 current->comm, preempt_count(), task_pid_nr(current),
2595 worker->current_func);
a62428c0
TH
2596 debug_show_held_locks(current);
2597 dump_stack();
2598 }
2599
b22ce278 2600 /*
025f50f3 2601 * The following prevents a kworker from hogging CPU on !PREEMPTION
b22ce278
TH
2602 * kernels, where a requeueing work item waiting for something to
2603 * happen could deadlock with stop_machine as such work item could
2604 * indefinitely requeue itself while all other CPUs are trapped in
789cbbec
JL
2605 * stop_machine. At the same time, report a quiescent RCU state so
2606 * the same condition doesn't freeze RCU.
b22ce278 2607 */
a7e6425e 2608 cond_resched();
b22ce278 2609
a9b8a985 2610 raw_spin_lock_irq(&pool->lock);
a62428c0 2611
616db877
TH
2612 /*
2613 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
2614 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
2615 * wq_cpu_intensive_thresh_us. Clear it.
2616 */
2617 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
fb0e7beb 2618
1b69ac6b
JW
2619 /* tag the worker for identification in schedule() */
2620 worker->last_func = worker->current_func;
2621
a62428c0 2622 /* we're done with it, release */
42f8570f 2623 hash_del(&worker->hentry);
c34056a3 2624 worker->current_work = NULL;
a2c1c57b 2625 worker->current_func = NULL;
112202d9 2626 worker->current_pwq = NULL;
d812796e 2627 worker->current_color = INT_MAX;
c4560c2c 2628 pwq_dec_nr_in_flight(pwq, work_data);
a62428c0
TH
2629}
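/*
 * Illustrative sketch, not part of the original file: the kind of handler
 * that ends up being invoked as worker->current_func above.  It typically
 * recovers its containing object with container_of(); struct my_device and
 * its members are hypothetical.
 */
struct my_device {
	int			id;
	struct work_struct	refresh_work;
};

static void my_refresh_fn(struct work_struct *work)
{
	struct my_device *dev = container_of(work, struct my_device,
					     refresh_work);

	pr_debug("refreshing device %d\n", dev->id);
}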
2630
affee4b2
TH
2631/**
2632 * process_scheduled_works - process scheduled works
2633 * @worker: self
2634 *
2635 * Process all scheduled works. Please note that the scheduled list
2636 * may change while processing a work, so this function repeatedly
2637 * fetches a work from the top and executes it.
2638 *
2639 * CONTEXT:
a9b8a985 2640 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
affee4b2
TH
2641 * multiple times.
2642 */
2643static void process_scheduled_works(struct worker *worker)
1da177e4 2644{
affee4b2
TH
2645 while (!list_empty(&worker->scheduled)) {
2646 struct work_struct *work = list_first_entry(&worker->scheduled,
1da177e4 2647 struct work_struct, entry);
c34056a3 2648 process_one_work(worker, work);
1da177e4 2649 }
1da177e4
LT
2650}
2651
197f6acc
TH
2652static void set_pf_worker(bool val)
2653{
2654 mutex_lock(&wq_pool_attach_mutex);
2655 if (val)
2656 current->flags |= PF_WQ_WORKER;
2657 else
2658 current->flags &= ~PF_WQ_WORKER;
2659 mutex_unlock(&wq_pool_attach_mutex);
2660}
2661
4690c4ab
TH
2662/**
2663 * worker_thread - the worker thread function
c34056a3 2664 * @__worker: self
4690c4ab 2665 *
c5aa87bb
TH
2666 * The worker thread function. All workers belong to a worker_pool -
 2667 * either a per-cpu one or a dynamic unbound one. These workers process all
 2668 * work items regardless of their specific target workqueue. The only
 2669 * exception is work items which belong to workqueues with a rescuer, which
 2670 * will be explained in rescuer_thread().
d185af30
YB
2671 *
2672 * Return: 0
4690c4ab 2673 */
c34056a3 2674static int worker_thread(void *__worker)
1da177e4 2675{
c34056a3 2676 struct worker *worker = __worker;
bd7bdd43 2677 struct worker_pool *pool = worker->pool;
1da177e4 2678
e22bee78 2679 /* tell the scheduler that this is a workqueue worker */
197f6acc 2680 set_pf_worker(true);
c8e55f36 2681woke_up:
a9b8a985 2682 raw_spin_lock_irq(&pool->lock);
1da177e4 2683
a9ab775b
TH
2684 /* am I supposed to die? */
2685 if (unlikely(worker->flags & WORKER_DIE)) {
a9b8a985 2686 raw_spin_unlock_irq(&pool->lock);
197f6acc 2687 set_pf_worker(false);
60f5a4bc
LJ
2688
2689 set_task_comm(worker->task, "kworker/dying");
e441b56f 2690 ida_free(&pool->worker_ida, worker->id);
a2d812a2 2691 worker_detach_from_pool(worker);
e02b9312 2692 WARN_ON_ONCE(!list_empty(&worker->entry));
60f5a4bc 2693 kfree(worker);
a9ab775b 2694 return 0;
c8e55f36 2695 }
affee4b2 2696
c8e55f36 2697 worker_leave_idle(worker);
db7bccf4 2698recheck:
e22bee78 2699 /* no more worker necessary? */
63d95a91 2700 if (!need_more_worker(pool))
e22bee78
TH
2701 goto sleep;
2702
2703 /* do we need to manage? */
63d95a91 2704 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
e22bee78
TH
2705 goto recheck;
2706
c8e55f36
TH
2707 /*
2708 * ->scheduled list can only be filled while a worker is
2709 * preparing to process a work or actually processing it.
2710 * Make sure nobody diddled with it while I was sleeping.
2711 */
6183c009 2712 WARN_ON_ONCE(!list_empty(&worker->scheduled));
c8e55f36 2713
e22bee78 2714 /*
a9ab775b
TH
2715 * Finish PREP stage. We're guaranteed to have at least one idle
2716 * worker or that someone else has already assumed the manager
2717 * role. This is where @worker starts participating in concurrency
2718 * management if applicable and concurrency management is restored
2719 * after being rebound. See rebind_workers() for details.
e22bee78 2720 */
a9ab775b 2721 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
e22bee78
TH
2722
2723 do {
c8e55f36 2724 struct work_struct *work =
bd7bdd43 2725 list_first_entry(&pool->worklist,
c8e55f36
TH
2726 struct work_struct, entry);
2727
82607adc
TH
2728 pool->watchdog_ts = jiffies;
2729
c8e55f36
TH
2730 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
2731 /* optimization path, not strictly necessary */
2732 process_one_work(worker, work);
2733 if (unlikely(!list_empty(&worker->scheduled)))
affee4b2 2734 process_scheduled_works(worker);
c8e55f36
TH
2735 } else {
2736 move_linked_works(work, &worker->scheduled, NULL);
2737 process_scheduled_works(worker);
affee4b2 2738 }
63d95a91 2739 } while (keep_working(pool));
e22bee78 2740
228f1d00 2741 worker_set_flags(worker, WORKER_PREP);
d313dd85 2742sleep:
c8e55f36 2743 /*
d565ed63
TH
2744 * pool->lock is held and there's no work to process and no need to
2745 * manage, sleep. Workers are woken up only while holding
2746 * pool->lock or from local cpu, so setting the current state
2747 * before releasing pool->lock is enough to prevent losing any
2748 * event.
c8e55f36
TH
2749 */
2750 worker_enter_idle(worker);
c5a94a61 2751 __set_current_state(TASK_IDLE);
a9b8a985 2752 raw_spin_unlock_irq(&pool->lock);
c8e55f36
TH
2753 schedule();
2754 goto woke_up;
1da177e4
LT
2755}
2756
e22bee78
TH
2757/**
2758 * rescuer_thread - the rescuer thread function
111c225a 2759 * @__rescuer: self
e22bee78
TH
2760 *
2761 * Workqueue rescuer thread function. There's one rescuer for each
493008a8 2762 * workqueue which has WQ_MEM_RECLAIM set.
e22bee78 2763 *
706026c2 2764 * Regular work processing on a pool may block trying to create a new
e22bee78
TH
 2765 * worker, which uses a GFP_KERNEL allocation that has a slight chance of
 2766 * developing into a deadlock if some works currently on the same queue
 2767 * need to be processed to satisfy the GFP_KERNEL allocation. This is
 2768 * the problem the rescuer solves.
2769 *
706026c2
TH
2770 * When such condition is possible, the pool summons rescuers of all
2771 * workqueues which have works queued on the pool and let them process
e22bee78
TH
2772 * those works so that forward progress can be guaranteed.
2773 *
2774 * This should happen rarely.
d185af30
YB
2775 *
2776 * Return: 0
e22bee78 2777 */
111c225a 2778static int rescuer_thread(void *__rescuer)
e22bee78 2779{
111c225a
TH
2780 struct worker *rescuer = __rescuer;
2781 struct workqueue_struct *wq = rescuer->rescue_wq;
e22bee78 2782 struct list_head *scheduled = &rescuer->scheduled;
4d595b86 2783 bool should_stop;
e22bee78
TH
2784
2785 set_user_nice(current, RESCUER_NICE_LEVEL);
111c225a
TH
2786
2787 /*
2788 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2789 * doesn't participate in concurrency management.
2790 */
197f6acc 2791 set_pf_worker(true);
e22bee78 2792repeat:
c5a94a61 2793 set_current_state(TASK_IDLE);
e22bee78 2794
4d595b86
LJ
2795 /*
2796 * By the time the rescuer is requested to stop, the workqueue
2797 * shouldn't have any work pending, but @wq->maydays may still have
2798 * pwq(s) queued. This can happen by non-rescuer workers consuming
2799 * all the work items before the rescuer got to them. Go through
2800 * @wq->maydays processing before acting on should_stop so that the
2801 * list is always empty on exit.
2802 */
2803 should_stop = kthread_should_stop();
e22bee78 2804
493a1724 2805 /* see whether any pwq is asking for help */
a9b8a985 2806 raw_spin_lock_irq(&wq_mayday_lock);
493a1724
TH
2807
2808 while (!list_empty(&wq->maydays)) {
2809 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2810 struct pool_workqueue, mayday_node);
112202d9 2811 struct worker_pool *pool = pwq->pool;
e22bee78 2812 struct work_struct *work, *n;
82607adc 2813 bool first = true;
e22bee78
TH
2814
2815 __set_current_state(TASK_RUNNING);
493a1724
TH
2816 list_del_init(&pwq->mayday_node);
2817
a9b8a985 2818 raw_spin_unlock_irq(&wq_mayday_lock);
e22bee78 2819
51697d39
LJ
2820 worker_attach_to_pool(rescuer, pool);
2821
a9b8a985 2822 raw_spin_lock_irq(&pool->lock);
e22bee78
TH
2823
2824 /*
2825 * Slurp in all works issued via this workqueue and
2826 * process'em.
2827 */
0479c8c5 2828 WARN_ON_ONCE(!list_empty(scheduled));
82607adc
TH
2829 list_for_each_entry_safe(work, n, &pool->worklist, entry) {
2830 if (get_work_pwq(work) == pwq) {
2831 if (first)
2832 pool->watchdog_ts = jiffies;
e22bee78 2833 move_linked_works(work, scheduled, &n);
725e8ec5 2834 pwq->stats[PWQ_STAT_RESCUED]++;
82607adc
TH
2835 }
2836 first = false;
2837 }
e22bee78 2838
008847f6
N
2839 if (!list_empty(scheduled)) {
2840 process_scheduled_works(rescuer);
2841
2842 /*
2843 * The above execution of rescued work items could
2844 * have created more to rescue through
f97a4a1a 2845 * pwq_activate_first_inactive() or chained
008847f6
N
2846 * queueing. Let's put @pwq back on mayday list so
2847 * that such back-to-back work items, which may be
2848 * being used to relieve memory pressure, don't
 2849 * incur MAYDAY_INTERVAL delay in between.
2850 */
4f3f4cf3 2851 if (pwq->nr_active && need_to_create_worker(pool)) {
a9b8a985 2852 raw_spin_lock(&wq_mayday_lock);
e66b39af
TH
2853 /*
2854 * Queue iff we aren't racing destruction
2855 * and somebody else hasn't queued it already.
2856 */
2857 if (wq->rescuer && list_empty(&pwq->mayday_node)) {
2858 get_pwq(pwq);
2859 list_add_tail(&pwq->mayday_node, &wq->maydays);
2860 }
a9b8a985 2861 raw_spin_unlock(&wq_mayday_lock);
008847f6
N
2862 }
2863 }
7576958a 2864
77668c8b
LJ
2865 /*
2866 * Put the reference grabbed by send_mayday(). @pool won't
13b1d625 2867 * go away while we're still attached to it.
77668c8b
LJ
2868 */
2869 put_pwq(pwq);
2870
7576958a 2871 /*
d8ca83e6 2872 * Leave this pool. If need_more_worker() is %true, notify a
7576958a
TH
2873 * regular worker; otherwise, we end up with 0 concurrency
2874 * and stalling the execution.
2875 */
d8ca83e6 2876 if (need_more_worker(pool))
63d95a91 2877 wake_up_worker(pool);
7576958a 2878
a9b8a985 2879 raw_spin_unlock_irq(&pool->lock);
13b1d625 2880
a2d812a2 2881 worker_detach_from_pool(rescuer);
13b1d625 2882
a9b8a985 2883 raw_spin_lock_irq(&wq_mayday_lock);
e22bee78
TH
2884 }
2885
a9b8a985 2886 raw_spin_unlock_irq(&wq_mayday_lock);
493a1724 2887
4d595b86
LJ
2888 if (should_stop) {
2889 __set_current_state(TASK_RUNNING);
197f6acc 2890 set_pf_worker(false);
4d595b86
LJ
2891 return 0;
2892 }
2893
111c225a
TH
2894 /* rescuers should never participate in concurrency management */
2895 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
e22bee78
TH
2896 schedule();
2897 goto repeat;
1da177e4
LT
2898}
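/*
 * Illustrative sketch, not part of the original file: a workqueue only gets
 * a rescuer when it is allocated with WQ_MEM_RECLAIM, as in this
 * hypothetical reclaim-path user.
 */
static struct workqueue_struct *my_reclaim_wq;

static int my_init_reclaim_wq(void)
{
	/* WQ_MEM_RECLAIM guarantees forward progress under memory pressure */
	my_reclaim_wq = alloc_workqueue("my_reclaim", WQ_MEM_RECLAIM, 0);
	return my_reclaim_wq ? 0 : -ENOMEM;
}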
2899
fca839c0
TH
2900/**
2901 * check_flush_dependency - check for flush dependency sanity
2902 * @target_wq: workqueue being flushed
2903 * @target_work: work item being flushed (NULL for workqueue flushes)
2904 *
2905 * %current is trying to flush the whole @target_wq or @target_work on it.
2906 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
2907 * reclaiming memory or running on a workqueue which doesn't have
2908 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
2909 * a deadlock.
2910 */
2911static void check_flush_dependency(struct workqueue_struct *target_wq,
2912 struct work_struct *target_work)
2913{
2914 work_func_t target_func = target_work ? target_work->func : NULL;
2915 struct worker *worker;
2916
2917 if (target_wq->flags & WQ_MEM_RECLAIM)
2918 return;
2919
2920 worker = current_wq_worker();
2921
2922 WARN_ONCE(current->flags & PF_MEMALLOC,
d75f773c 2923 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
fca839c0 2924 current->pid, current->comm, target_wq->name, target_func);
23d11a58
TH
2925 WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
2926 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
d75f773c 2927 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
fca839c0
TH
2928 worker->current_pwq->wq->name, worker->current_func,
2929 target_wq->name, target_func);
2930}
2931
fc2e4d70
ON
2932struct wq_barrier {
2933 struct work_struct work;
2934 struct completion done;
2607d7a6 2935 struct task_struct *task; /* purely informational */
fc2e4d70
ON
2936};
2937
2938static void wq_barrier_func(struct work_struct *work)
2939{
2940 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
2941 complete(&barr->done);
2942}
2943
4690c4ab
TH
2944/**
2945 * insert_wq_barrier - insert a barrier work
112202d9 2946 * @pwq: pwq to insert barrier into
4690c4ab 2947 * @barr: wq_barrier to insert
affee4b2
TH
2948 * @target: target work to attach @barr to
2949 * @worker: worker currently executing @target, NULL if @target is not executing
4690c4ab 2950 *
affee4b2
TH
2951 * @barr is linked to @target such that @barr is completed only after
2952 * @target finishes execution. Please note that the ordering
2953 * guarantee is observed only with respect to @target and on the local
2954 * cpu.
2955 *
2956 * Currently, a queued barrier can't be canceled. This is because
2957 * try_to_grab_pending() can't determine whether the work to be
2958 * grabbed is at the head of the queue and thus can't clear LINKED
2959 * flag of the previous work while there must be a valid next work
2960 * after a work with LINKED flag set.
2961 *
2962 * Note that when @worker is non-NULL, @target may be modified
112202d9 2963 * underneath us, so we can't reliably determine pwq from @target.
4690c4ab
TH
2964 *
2965 * CONTEXT:
a9b8a985 2966 * raw_spin_lock_irq(pool->lock).
4690c4ab 2967 */
112202d9 2968static void insert_wq_barrier(struct pool_workqueue *pwq,
affee4b2
TH
2969 struct wq_barrier *barr,
2970 struct work_struct *target, struct worker *worker)
fc2e4d70 2971{
d812796e
LJ
2972 unsigned int work_flags = 0;
2973 unsigned int work_color;
affee4b2 2974 struct list_head *head;
affee4b2 2975
dc186ad7 2976 /*
d565ed63 2977 * debugobject calls are safe here even with pool->lock locked
dc186ad7
TG
2978 * as we know for sure that this will not trigger any of the
2979 * checks and call back into the fixup functions where we
2980 * might deadlock.
2981 */
ca1cab37 2982 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
22df02bb 2983 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
52fa5bc5 2984
fd1a5b04
BP
2985 init_completion_map(&barr->done, &target->lockdep_map);
2986
2607d7a6 2987 barr->task = current;
83c22520 2988
018f3a13
LJ
2989 /* The barrier work item does not participate in pwq->nr_active. */
2990 work_flags |= WORK_STRUCT_INACTIVE;
2991
affee4b2
TH
2992 /*
2993 * If @target is currently being executed, schedule the
2994 * barrier to the worker; otherwise, put it after @target.
2995 */
d812796e 2996 if (worker) {
affee4b2 2997 head = worker->scheduled.next;
d812796e
LJ
2998 work_color = worker->current_color;
2999 } else {
affee4b2
TH
3000 unsigned long *bits = work_data_bits(target);
3001
3002 head = target->entry.next;
3003 /* there can already be other linked works, inherit and set */
d21cece0 3004 work_flags |= *bits & WORK_STRUCT_LINKED;
d812796e 3005 work_color = get_work_color(*bits);
affee4b2
TH
3006 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
3007 }
3008
d812796e
LJ
3009 pwq->nr_in_flight[work_color]++;
3010 work_flags |= work_color_to_flags(work_color);
3011
dc186ad7 3012 debug_work_activate(&barr->work);
d21cece0 3013 insert_work(pwq, &barr->work, head, work_flags);
fc2e4d70
ON
3014}
3015
73f53c4a 3016/**
112202d9 3017 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
73f53c4a
TH
3018 * @wq: workqueue being flushed
3019 * @flush_color: new flush color, < 0 for no-op
3020 * @work_color: new work color, < 0 for no-op
3021 *
112202d9 3022 * Prepare pwqs for workqueue flushing.
73f53c4a 3023 *
112202d9
TH
3024 * If @flush_color is non-negative, flush_color on all pwqs should be
3025 * -1. If no pwq has in-flight commands at the specified color, all
3026 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
3027 * has in flight commands, its pwq->flush_color is set to
3028 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
73f53c4a
TH
3029 * wakeup logic is armed and %true is returned.
3030 *
3031 * The caller should have initialized @wq->first_flusher prior to
3032 * calling this function with non-negative @flush_color. If
3033 * @flush_color is negative, no flush color update is done and %false
3034 * is returned.
3035 *
112202d9 3036 * If @work_color is non-negative, all pwqs should have the same
73f53c4a
TH
 3037 * work_color which precedes @work_color and all will be
3038 * advanced to @work_color.
3039 *
3040 * CONTEXT:
3c25a55d 3041 * mutex_lock(wq->mutex).
73f53c4a 3042 *
d185af30 3043 * Return:
73f53c4a
TH
3044 * %true if @flush_color >= 0 and there's something to flush. %false
3045 * otherwise.
3046 */
112202d9 3047static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
73f53c4a 3048 int flush_color, int work_color)
1da177e4 3049{
73f53c4a 3050 bool wait = false;
49e3cf44 3051 struct pool_workqueue *pwq;
1da177e4 3052
73f53c4a 3053 if (flush_color >= 0) {
6183c009 3054 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
112202d9 3055 atomic_set(&wq->nr_pwqs_to_flush, 1);
1da177e4 3056 }
2355b70f 3057
49e3cf44 3058 for_each_pwq(pwq, wq) {
112202d9 3059 struct worker_pool *pool = pwq->pool;
fc2e4d70 3060
a9b8a985 3061 raw_spin_lock_irq(&pool->lock);
83c22520 3062
73f53c4a 3063 if (flush_color >= 0) {
6183c009 3064 WARN_ON_ONCE(pwq->flush_color != -1);
fc2e4d70 3065
112202d9
TH
3066 if (pwq->nr_in_flight[flush_color]) {
3067 pwq->flush_color = flush_color;
3068 atomic_inc(&wq->nr_pwqs_to_flush);
73f53c4a
TH
3069 wait = true;
3070 }
3071 }
1da177e4 3072
73f53c4a 3073 if (work_color >= 0) {
6183c009 3074 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
112202d9 3075 pwq->work_color = work_color;
73f53c4a 3076 }
1da177e4 3077
a9b8a985 3078 raw_spin_unlock_irq(&pool->lock);
1da177e4 3079 }
2355b70f 3080
112202d9 3081 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
73f53c4a 3082 complete(&wq->first_flusher->done);
14441960 3083
73f53c4a 3084 return wait;
1da177e4
LT
3085}
3086
0fcb78c2 3087/**
c4f135d6 3088 * __flush_workqueue - ensure that any scheduled work has run to completion.
0fcb78c2 3089 * @wq: workqueue to flush
1da177e4 3090 *
c5aa87bb
TH
3091 * This function sleeps until all work items which were queued on entry
3092 * have finished execution, but it is not livelocked by new incoming ones.
1da177e4 3093 */
c4f135d6 3094void __flush_workqueue(struct workqueue_struct *wq)
1da177e4 3095{
73f53c4a
TH
3096 struct wq_flusher this_flusher = {
3097 .list = LIST_HEAD_INIT(this_flusher.list),
3098 .flush_color = -1,
fd1a5b04 3099 .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
73f53c4a
TH
3100 };
3101 int next_color;
1da177e4 3102
3347fa09
TH
3103 if (WARN_ON(!wq_online))
3104 return;
3105
87915adc
JB
3106 lock_map_acquire(&wq->lockdep_map);
3107 lock_map_release(&wq->lockdep_map);
3108
3c25a55d 3109 mutex_lock(&wq->mutex);
73f53c4a
TH
3110
3111 /*
3112 * Start-to-wait phase
3113 */
3114 next_color = work_next_color(wq->work_color);
3115
3116 if (next_color != wq->flush_color) {
3117 /*
3118 * Color space is not full. The current work_color
3119 * becomes our flush_color and work_color is advanced
3120 * by one.
3121 */
6183c009 3122 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
73f53c4a
TH
3123 this_flusher.flush_color = wq->work_color;
3124 wq->work_color = next_color;
3125
3126 if (!wq->first_flusher) {
3127 /* no flush in progress, become the first flusher */
6183c009 3128 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
3129
3130 wq->first_flusher = &this_flusher;
3131
112202d9 3132 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
73f53c4a
TH
3133 wq->work_color)) {
3134 /* nothing to flush, done */
3135 wq->flush_color = next_color;
3136 wq->first_flusher = NULL;
3137 goto out_unlock;
3138 }
3139 } else {
3140 /* wait in queue */
6183c009 3141 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
73f53c4a 3142 list_add_tail(&this_flusher.list, &wq->flusher_queue);
112202d9 3143 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
3144 }
3145 } else {
3146 /*
3147 * Oops, color space is full, wait on overflow queue.
3148 * The next flush completion will assign us
3149 * flush_color and transfer to flusher_queue.
3150 */
3151 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
3152 }
3153
fca839c0
TH
3154 check_flush_dependency(wq, NULL);
3155
3c25a55d 3156 mutex_unlock(&wq->mutex);
73f53c4a
TH
3157
3158 wait_for_completion(&this_flusher.done);
3159
3160 /*
3161 * Wake-up-and-cascade phase
3162 *
3163 * First flushers are responsible for cascading flushes and
3164 * handling overflow. Non-first flushers can simply return.
3165 */
00d5d15b 3166 if (READ_ONCE(wq->first_flusher) != &this_flusher)
73f53c4a
TH
3167 return;
3168
3c25a55d 3169 mutex_lock(&wq->mutex);
73f53c4a 3170
4ce48b37
TH
3171 /* we might have raced, check again with mutex held */
3172 if (wq->first_flusher != &this_flusher)
3173 goto out_unlock;
3174
00d5d15b 3175 WRITE_ONCE(wq->first_flusher, NULL);
73f53c4a 3176
6183c009
TH
3177 WARN_ON_ONCE(!list_empty(&this_flusher.list));
3178 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
3179
3180 while (true) {
3181 struct wq_flusher *next, *tmp;
3182
3183 /* complete all the flushers sharing the current flush color */
3184 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
3185 if (next->flush_color != wq->flush_color)
3186 break;
3187 list_del_init(&next->list);
3188 complete(&next->done);
3189 }
3190
6183c009
TH
3191 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
3192 wq->flush_color != work_next_color(wq->work_color));
73f53c4a
TH
3193
3194 /* this flush_color is finished, advance by one */
3195 wq->flush_color = work_next_color(wq->flush_color);
3196
3197 /* one color has been freed, handle overflow queue */
3198 if (!list_empty(&wq->flusher_overflow)) {
3199 /*
3200 * Assign the same color to all overflowed
3201 * flushers, advance work_color and append to
3202 * flusher_queue. This is the start-to-wait
3203 * phase for these overflowed flushers.
3204 */
3205 list_for_each_entry(tmp, &wq->flusher_overflow, list)
3206 tmp->flush_color = wq->work_color;
3207
3208 wq->work_color = work_next_color(wq->work_color);
3209
3210 list_splice_tail_init(&wq->flusher_overflow,
3211 &wq->flusher_queue);
112202d9 3212 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
3213 }
3214
3215 if (list_empty(&wq->flusher_queue)) {
6183c009 3216 WARN_ON_ONCE(wq->flush_color != wq->work_color);
73f53c4a
TH
3217 break;
3218 }
3219
3220 /*
3221 * Need to flush more colors. Make the next flusher
112202d9 3222 * the new first flusher and arm pwqs.
73f53c4a 3223 */
6183c009
TH
3224 WARN_ON_ONCE(wq->flush_color == wq->work_color);
3225 WARN_ON_ONCE(wq->flush_color != next->flush_color);
73f53c4a
TH
3226
3227 list_del_init(&next->list);
3228 wq->first_flusher = next;
3229
112202d9 3230 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
73f53c4a
TH
3231 break;
3232
3233 /*
3234 * Meh... this color is already done, clear first
3235 * flusher and repeat cascading.
3236 */
3237 wq->first_flusher = NULL;
3238 }
3239
3240out_unlock:
3c25a55d 3241 mutex_unlock(&wq->mutex);
1da177e4 3242}
c4f135d6 3243EXPORT_SYMBOL(__flush_workqueue);
1da177e4 3244
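/*
 * Illustrative sketch (not from this file): a hypothetical driver that
 * queues several work items on its own workqueue and then calls
 * flush_workqueue() (which ends up in __flush_workqueue() above) to wait
 * for everything queued so far to finish. foo_wq, foo_update_fn and
 * foo_push_update() are made-up names.
 */
static struct workqueue_struct *foo_wq;

struct foo_update {
	struct work_struct work;
	int value;
};

static void foo_update_fn(struct work_struct *work)
{
	struct foo_update *upd = container_of(work, struct foo_update, work);

	pr_info("applying update %d\n", upd->value);
	kfree(upd);
}

static int foo_push_update(int value)
{
	struct foo_update *upd = kzalloc(sizeof(*upd), GFP_KERNEL);

	if (!upd)
		return -ENOMEM;
	upd->value = value;
	INIT_WORK(&upd->work, foo_update_fn);
	queue_work(foo_wq, &upd->work);
	return 0;
}

static void foo_settle(void)
{
	/* waits for all updates queued above; new ones may still arrive */
	flush_workqueue(foo_wq);
}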
9c5a2ba7
TH
3245/**
3246 * drain_workqueue - drain a workqueue
3247 * @wq: workqueue to drain
3248 *
3249 * Wait until the workqueue becomes empty. While draining is in progress,
3250 * only chain queueing is allowed. IOW, only currently pending or running
3251 * work items on @wq can queue further work items on it. @wq is flushed
b749b1b6 3252 * repeatedly until it becomes empty. The number of flushes is determined
9c5a2ba7
TH
3253 * by the depth of chaining and should be relatively short. Whine if it
3254 * takes too long.
3255 */
3256void drain_workqueue(struct workqueue_struct *wq)
3257{
3258 unsigned int flush_cnt = 0;
49e3cf44 3259 struct pool_workqueue *pwq;
9c5a2ba7
TH
3260
3261 /*
3262 * __queue_work() needs to test whether there are drainers, is much
3263 * hotter than drain_workqueue() and already looks at @wq->flags.
618b01eb 3264 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
9c5a2ba7 3265 */
87fc741e 3266 mutex_lock(&wq->mutex);
9c5a2ba7 3267 if (!wq->nr_drainers++)
618b01eb 3268 wq->flags |= __WQ_DRAINING;
87fc741e 3269 mutex_unlock(&wq->mutex);
9c5a2ba7 3270reflush:
c4f135d6 3271 __flush_workqueue(wq);
9c5a2ba7 3272
b09f4fd3 3273 mutex_lock(&wq->mutex);
76af4d93 3274
49e3cf44 3275 for_each_pwq(pwq, wq) {
fa2563e4 3276 bool drained;
9c5a2ba7 3277
a9b8a985 3278 raw_spin_lock_irq(&pwq->pool->lock);
f97a4a1a 3279 drained = !pwq->nr_active && list_empty(&pwq->inactive_works);
a9b8a985 3280 raw_spin_unlock_irq(&pwq->pool->lock);
fa2563e4
TT
3281
3282 if (drained)
9c5a2ba7
TH
3283 continue;
3284
3285 if (++flush_cnt == 10 ||
3286 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
e9ad2eb3
SZ
3287 pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
3288 wq->name, __func__, flush_cnt);
76af4d93 3289
b09f4fd3 3290 mutex_unlock(&wq->mutex);
9c5a2ba7
TH
3291 goto reflush;
3292 }
3293
9c5a2ba7 3294 if (!--wq->nr_drainers)
618b01eb 3295 wq->flags &= ~__WQ_DRAINING;
87fc741e 3296 mutex_unlock(&wq->mutex);
9c5a2ba7
TH
3297}
3298EXPORT_SYMBOL_GPL(drain_workqueue);
3299
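/*
 * Illustrative sketch (not from this file): drain_workqueue() vs. a work
 * item that chain-queues itself a bounded number of times. While draining,
 * only such chain queueing is allowed; external queue_work() callers are
 * expected to have been stopped already. bar_wq, bar_step_fn and bar_steps
 * are hypothetical names.
 */
static struct workqueue_struct *bar_wq;
static struct work_struct bar_step_work;
static int bar_steps;

static void bar_step_fn(struct work_struct *work)
{
	/* chain queueing: allowed even while bar_wq is being drained */
	if (--bar_steps > 0)
		queue_work(bar_wq, work);
}

static void bar_shutdown(void)
{
	/* returns only once the chain above has fully unwound */
	drain_workqueue(bar_wq);
	destroy_workqueue(bar_wq);
}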
d6e89786
JB
3300static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
3301 bool from_cancel)
db700897 3302{
affee4b2 3303 struct worker *worker = NULL;
c9e7cf27 3304 struct worker_pool *pool;
112202d9 3305 struct pool_workqueue *pwq;
db700897
ON
3306
3307 might_sleep();
fa1b54e6 3308
24acfb71 3309 rcu_read_lock();
c9e7cf27 3310 pool = get_work_pool(work);
fa1b54e6 3311 if (!pool) {
24acfb71 3312 rcu_read_unlock();
baf59022 3313 return false;
fa1b54e6 3314 }
db700897 3315
a9b8a985 3316 raw_spin_lock_irq(&pool->lock);
0b3dae68 3317 /* see the comment in try_to_grab_pending() with the same code */
112202d9
TH
3318 pwq = get_work_pwq(work);
3319 if (pwq) {
3320 if (unlikely(pwq->pool != pool))
4690c4ab 3321 goto already_gone;
606a5020 3322 } else {
c9e7cf27 3323 worker = find_worker_executing_work(pool, work);
affee4b2 3324 if (!worker)
4690c4ab 3325 goto already_gone;
112202d9 3326 pwq = worker->current_pwq;
606a5020 3327 }
db700897 3328
fca839c0
TH
3329 check_flush_dependency(pwq->wq, work);
3330
112202d9 3331 insert_wq_barrier(pwq, barr, work, worker);
a9b8a985 3332 raw_spin_unlock_irq(&pool->lock);
7a22ad75 3333
e159489b 3334 /*
a1d14934
PZ
3335 * Force a lock recursion deadlock when using flush_work() inside a
3336 * single-threaded or rescuer equipped workqueue.
3337 *
3338 * For single threaded workqueues the deadlock happens when the work
3339 * is after the work issuing the flush_work(). For rescuer equipped
3340 * workqueues the deadlock happens when the rescuer stalls, blocking
3341 * forward progress.
e159489b 3342 */
d6e89786
JB
3343 if (!from_cancel &&
3344 (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
112202d9 3345 lock_map_acquire(&pwq->wq->lockdep_map);
a1d14934
PZ
3346 lock_map_release(&pwq->wq->lockdep_map);
3347 }
24acfb71 3348 rcu_read_unlock();
401a8d04 3349 return true;
4690c4ab 3350already_gone:
a9b8a985 3351 raw_spin_unlock_irq(&pool->lock);
24acfb71 3352 rcu_read_unlock();
401a8d04 3353 return false;
db700897 3354}
baf59022 3355
d6e89786
JB
3356static bool __flush_work(struct work_struct *work, bool from_cancel)
3357{
3358 struct wq_barrier barr;
3359
3360 if (WARN_ON(!wq_online))
3361 return false;
3362
4d43d395
TH
3363 if (WARN_ON(!work->func))
3364 return false;
3365
c0feea59
TH
3366 lock_map_acquire(&work->lockdep_map);
3367 lock_map_release(&work->lockdep_map);
87915adc 3368
d6e89786
JB
3369 if (start_flush_work(work, &barr, from_cancel)) {
3370 wait_for_completion(&barr.done);
3371 destroy_work_on_stack(&barr.work);
3372 return true;
3373 } else {
3374 return false;
3375 }
3376}
3377
baf59022
TH
3378/**
3379 * flush_work - wait for a work to finish executing the last queueing instance
3380 * @work: the work to flush
3381 *
606a5020
TH
3382 * Wait until @work has finished execution. @work is guaranteed to be idle
3383 * on return if it hasn't been requeued since flush started.
baf59022 3384 *
d185af30 3385 * Return:
baf59022
TH
3386 * %true if flush_work() waited for the work to finish execution,
3387 * %false if it was already idle.
3388 */
3389bool flush_work(struct work_struct *work)
3390{
d6e89786 3391 return __flush_work(work, false);
6e84d644 3392}
606a5020 3393EXPORT_SYMBOL_GPL(flush_work);
6e84d644 3394
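/*
 * Illustrative sketch (not from this file): waiting for one specific work
 * item with flush_work(). Only the last queueing instance is waited for;
 * if the handler requeues the work afterwards, that new instance is not
 * covered. baz_refresh_work and baz_refresh_fn are hypothetical names.
 */
static void baz_refresh_fn(struct work_struct *work)
{
	pr_info("refreshing cached state\n");
}

static DECLARE_WORK(baz_refresh_work, baz_refresh_fn);

static void baz_wait_for_refresh(void)
{
	schedule_work(&baz_refresh_work);
	/* returns once the instance queued above has finished executing */
	flush_work(&baz_refresh_work);
}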
8603e1b3 3395struct cwt_wait {
ac6424b9 3396 wait_queue_entry_t wait;
8603e1b3
TH
3397 struct work_struct *work;
3398};
3399
ac6424b9 3400static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
8603e1b3
TH
3401{
3402 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
3403
3404 if (cwait->work != key)
3405 return 0;
3406 return autoremove_wake_function(wait, mode, sync, key);
3407}
3408
36e227d2 3409static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
1f1f642e 3410{
8603e1b3 3411 static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
bbb68dfa 3412 unsigned long flags;
1f1f642e
ON
3413 int ret;
3414
3415 do {
bbb68dfa
TH
3416 ret = try_to_grab_pending(work, is_dwork, &flags);
3417 /*
8603e1b3
TH
3418 * If someone else is already canceling, wait for it to
3419 * finish. flush_work() doesn't work for PREEMPT_NONE
3420 * because we may get scheduled between @work's completion
3421 * and the other canceling task resuming and clearing
3422 * CANCELING - flush_work() will return false immediately
3423 * as @work is no longer busy, try_to_grab_pending() will
3424 * return -ENOENT as @work is still being canceled and the
3425 * other canceling task won't be able to clear CANCELING as
3426 * we're hogging the CPU.
3427 *
3428 * Let's wait for completion using a waitqueue. As this
3429 * may lead to the thundering herd problem, use a custom
3430 * wake function which matches @work along with exclusive
3431 * wait and wakeup.
bbb68dfa 3432 */
8603e1b3
TH
3433 if (unlikely(ret == -ENOENT)) {
3434 struct cwt_wait cwait;
3435
3436 init_wait(&cwait.wait);
3437 cwait.wait.func = cwt_wakefn;
3438 cwait.work = work;
3439
3440 prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
3441 TASK_UNINTERRUPTIBLE);
3442 if (work_is_canceling(work))
3443 schedule();
3444 finish_wait(&cancel_waitq, &cwait.wait);
3445 }
1f1f642e
ON
3446 } while (unlikely(ret < 0));
3447
bbb68dfa
TH
3448 /* tell other tasks trying to grab @work to back off */
3449 mark_work_canceling(work);
3450 local_irq_restore(flags);
3451
3347fa09
TH
3452 /*
3453 * This allows canceling during early boot. We know that @work
3454 * isn't executing.
3455 */
3456 if (wq_online)
d6e89786 3457 __flush_work(work, true);
3347fa09 3458
7a22ad75 3459 clear_work_data(work);
8603e1b3
TH
3460
3461 /*
3462 * Paired with prepare_to_wait() above so that either
3463 * waitqueue_active() is visible here or !work_is_canceling() is
3464 * visible there.
3465 */
3466 smp_mb();
3467 if (waitqueue_active(&cancel_waitq))
3468 __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
3469
1f1f642e
ON
3470 return ret;
3471}
3472
6e84d644 3473/**
401a8d04
TH
3474 * cancel_work_sync - cancel a work and wait for it to finish
3475 * @work: the work to cancel
6e84d644 3476 *
401a8d04
TH
3477 * Cancel @work and wait for its execution to finish. This function
3478 * can be used even if the work re-queues itself or migrates to
3479 * another workqueue. On return from this function, @work is
3480 * guaranteed to be not pending or executing on any CPU.
1f1f642e 3481 *
401a8d04
TH
3482 * cancel_work_sync(&delayed_work->work) must not be used for
 3483 * delayed work items. Use cancel_delayed_work_sync() instead.
6e84d644 3484 *
401a8d04 3485 * The caller must ensure that the workqueue on which @work was last
6e84d644 3486 * queued can't be destroyed before this function returns.
401a8d04 3487 *
d185af30 3488 * Return:
401a8d04 3489 * %true if @work was pending, %false otherwise.
6e84d644 3490 */
401a8d04 3491bool cancel_work_sync(struct work_struct *work)
6e84d644 3492{
36e227d2 3493 return __cancel_work_timer(work, false);
b89deed3 3494}
28e53bdd 3495EXPORT_SYMBOL_GPL(cancel_work_sync);
b89deed3 3496
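/*
 * Illustrative sketch (not from this file): the usual teardown order for an
 * object embedding a work_struct. Stop new submissions first, then
 * cancel_work_sync() so the handler is neither pending nor running before
 * the memory is freed. struct qux_dev and its members are made up.
 */
struct qux_dev {
	struct work_struct irq_work;
	bool shutting_down;
};

static void qux_remove(struct qux_dev *qux)
{
	/* make sure the submitters stop queueing qux->irq_work first */
	WRITE_ONCE(qux->shutting_down, true);

	/* after this, qux->irq_work is neither pending nor executing */
	cancel_work_sync(&qux->irq_work);

	kfree(qux);
}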
6e84d644 3497/**
401a8d04
TH
3498 * flush_delayed_work - wait for a dwork to finish executing the last queueing
3499 * @dwork: the delayed work to flush
6e84d644 3500 *
401a8d04
TH
3501 * Delayed timer is cancelled and the pending work is queued for
3502 * immediate execution. Like flush_work(), this function only
3503 * considers the last queueing instance of @dwork.
1f1f642e 3504 *
d185af30 3505 * Return:
401a8d04
TH
3506 * %true if flush_work() waited for the work to finish execution,
3507 * %false if it was already idle.
6e84d644 3508 */
401a8d04
TH
3509bool flush_delayed_work(struct delayed_work *dwork)
3510{
8930caba 3511 local_irq_disable();
401a8d04 3512 if (del_timer_sync(&dwork->timer))
60c057bc 3513 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
8930caba 3514 local_irq_enable();
401a8d04
TH
3515 return flush_work(&dwork->work);
3516}
3517EXPORT_SYMBOL(flush_delayed_work);
3518
05f0fe6b
TH
3519/**
3520 * flush_rcu_work - wait for a rwork to finish executing the last queueing
3521 * @rwork: the rcu work to flush
3522 *
3523 * Return:
3524 * %true if flush_rcu_work() waited for the work to finish execution,
3525 * %false if it was already idle.
3526 */
3527bool flush_rcu_work(struct rcu_work *rwork)
3528{
3529 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
3530 rcu_barrier();
3531 flush_work(&rwork->work);
3532 return true;
3533 } else {
3534 return flush_work(&rwork->work);
3535 }
3536}
3537EXPORT_SYMBOL(flush_rcu_work);
3538
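/*
 * Illustrative sketch (not from this file): pairing queue_rcu_work() with
 * flush_rcu_work(). The work item runs only after an RCU grace period has
 * elapsed, so entries unlinked from an RCU-protected structure beforehand
 * can be freed safely from the handler. quux_reap_rwork and quux_reap_fn
 * are hypothetical names.
 */
static void quux_reap_fn(struct work_struct *work)
{
	/* runs after a grace period: stale readers are guaranteed gone */
	pr_info("reaping retired entries\n");
}

static struct rcu_work quux_reap_rwork;

static void quux_retire_and_wait(void)
{
	INIT_RCU_WORK(&quux_reap_rwork, quux_reap_fn);
	queue_rcu_work(system_wq, &quux_reap_rwork);

	/* waits for both the grace period and the handler above */
	flush_rcu_work(&quux_reap_rwork);
}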
f72b8792
JA
3539static bool __cancel_work(struct work_struct *work, bool is_dwork)
3540{
3541 unsigned long flags;
3542 int ret;
3543
3544 do {
3545 ret = try_to_grab_pending(work, is_dwork, &flags);
3546 } while (unlikely(ret == -EAGAIN));
3547
3548 if (unlikely(ret < 0))
3549 return false;
3550
3551 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
3552 local_irq_restore(flags);
3553 return ret;
3554}
3555
73b4b532
AG
3556/*
3557 * See cancel_delayed_work()
3558 */
3559bool cancel_work(struct work_struct *work)
3560{
3561 return __cancel_work(work, false);
3562}
3563EXPORT_SYMBOL(cancel_work);
3564
09383498 3565/**
57b30ae7
TH
3566 * cancel_delayed_work - cancel a delayed work
3567 * @dwork: delayed_work to cancel
09383498 3568 *
d185af30
YB
3569 * Kill off a pending delayed_work.
3570 *
3571 * Return: %true if @dwork was pending and canceled; %false if it wasn't
3572 * pending.
3573 *
3574 * Note:
3575 * The work callback function may still be running on return, unless
3576 * it returns %true and the work doesn't re-arm itself. Explicitly flush or
3577 * use cancel_delayed_work_sync() to wait on it.
09383498 3578 *
57b30ae7 3579 * This function is safe to call from any context including IRQ handler.
09383498 3580 */
57b30ae7 3581bool cancel_delayed_work(struct delayed_work *dwork)
09383498 3582{
f72b8792 3583 return __cancel_work(&dwork->work, true);
09383498 3584}
57b30ae7 3585EXPORT_SYMBOL(cancel_delayed_work);
09383498 3586
401a8d04
TH
3587/**
3588 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
3589 * @dwork: the delayed work cancel
3590 *
3591 * This is cancel_work_sync() for delayed works.
3592 *
d185af30 3593 * Return:
401a8d04
TH
3594 * %true if @dwork was pending, %false otherwise.
3595 */
3596bool cancel_delayed_work_sync(struct delayed_work *dwork)
6e84d644 3597{
36e227d2 3598 return __cancel_work_timer(&dwork->work, true);
6e84d644 3599}
f5a421a4 3600EXPORT_SYMBOL(cancel_delayed_work_sync);
1da177e4 3601
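/*
 * Illustrative sketch (not from this file): a self-rearming poll implemented
 * with delayed_work and stopped with cancel_delayed_work_sync(). Because the
 * handler rearms itself, only the _sync variant combined with a stop flag
 * guarantees nothing is pending or running afterwards. The corge_* names
 * are made up.
 */
static struct delayed_work corge_poll_dwork;
static bool corge_stop;

static void corge_poll_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);

	pr_info("polling hardware\n");
	if (!READ_ONCE(corge_stop))
		schedule_delayed_work(dwork, msecs_to_jiffies(500));
}

static void corge_start(void)
{
	INIT_DELAYED_WORK(&corge_poll_dwork, corge_poll_fn);
	schedule_delayed_work(&corge_poll_dwork, 0);
}

static void corge_stop_polling(void)
{
	WRITE_ONCE(corge_stop, true);
	/* also handles the case where the timer hasn't fired yet */
	cancel_delayed_work_sync(&corge_poll_dwork);
}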
b6136773 3602/**
31ddd871 3603 * schedule_on_each_cpu - execute a function synchronously on each online CPU
b6136773 3604 * @func: the function to call
b6136773 3605 *
31ddd871
TH
3606 * schedule_on_each_cpu() executes @func on each online CPU using the
3607 * system workqueue and blocks until all CPUs have completed.
b6136773 3608 * schedule_on_each_cpu() is very slow.
31ddd871 3609 *
d185af30 3610 * Return:
31ddd871 3611 * 0 on success, -errno on failure.
b6136773 3612 */
65f27f38 3613int schedule_on_each_cpu(work_func_t func)
15316ba8
CL
3614{
3615 int cpu;
38f51568 3616 struct work_struct __percpu *works;
15316ba8 3617
b6136773
AM
3618 works = alloc_percpu(struct work_struct);
3619 if (!works)
15316ba8 3620 return -ENOMEM;
b6136773 3621
ffd8bea8 3622 cpus_read_lock();
93981800 3623
15316ba8 3624 for_each_online_cpu(cpu) {
9bfb1839
IM
3625 struct work_struct *work = per_cpu_ptr(works, cpu);
3626
3627 INIT_WORK(work, func);
b71ab8c2 3628 schedule_work_on(cpu, work);
65a64464 3629 }
93981800
TH
3630
3631 for_each_online_cpu(cpu)
3632 flush_work(per_cpu_ptr(works, cpu));
3633
ffd8bea8 3634 cpus_read_unlock();
b6136773 3635 free_percpu(works);
15316ba8
CL
3636 return 0;
3637}
3638
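/*
 * Illustrative sketch (not from this file): schedule_on_each_cpu() runs the
 * callback once on every online CPU, in process context, and only returns
 * after all of them have finished; as the comment above notes, it allocates
 * one work item per CPU and flushes them all, so it is slow.
 * grault_drop_caches() is a hypothetical per-CPU cleanup.
 */
static void grault_drop_caches(struct work_struct *work)
{
	/* the worker executing this is bound to one particular CPU */
	pr_info("dropping per-CPU cache on CPU%d\n", raw_smp_processor_id());
}

static int grault_drop_all_caches(void)
{
	return schedule_on_each_cpu(grault_drop_caches);
}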
1fa44eca
JB
3639/**
3640 * execute_in_process_context - reliably execute the routine with user context
3641 * @fn: the function to execute
1fa44eca
JB
3642 * @ew: guaranteed storage for the execute work structure (must
3643 * be available when the work executes)
3644 *
3645 * Executes the function immediately if process context is available,
3646 * otherwise schedules the function for delayed execution.
3647 *
d185af30 3648 * Return: 0 - function was executed
1fa44eca
JB
3649 * 1 - function was scheduled for execution
3650 */
65f27f38 3651int execute_in_process_context(work_func_t fn, struct execute_work *ew)
1fa44eca
JB
3652{
3653 if (!in_interrupt()) {
65f27f38 3654 fn(&ew->work);
1fa44eca
JB
3655 return 0;
3656 }
3657
65f27f38 3658 INIT_WORK(&ew->work, fn);
1fa44eca
JB
3659 schedule_work(&ew->work);
3660
3661 return 1;
3662}
3663EXPORT_SYMBOL_GPL(execute_in_process_context);
3664
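/*
 * Illustrative sketch (not from this file): execute_in_process_context()
 * lets a release path that may be reached from either process or interrupt
 * context run its callback immediately when possible and defer it to a work
 * item otherwise. The garply_* names and the embedded execute_work are
 * assumptions for this sketch.
 */
struct garply_dev {
	struct execute_work release_ew;
	/* ... */
};

static void garply_release_fn(struct work_struct *work)
{
	struct garply_dev *gd = container_of(work, struct garply_dev,
					     release_ew.work);

	kfree(gd);
}

static void garply_release(struct garply_dev *gd)
{
	/* runs inline in process context, or via schedule_work() from IRQ */
	execute_in_process_context(garply_release_fn, &gd->release_ew);
}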
6ba94429
FW
3665/**
3666 * free_workqueue_attrs - free a workqueue_attrs
3667 * @attrs: workqueue_attrs to free
226223ab 3668 *
6ba94429 3669 * Undo alloc_workqueue_attrs().
226223ab 3670 */
513c98d0 3671void free_workqueue_attrs(struct workqueue_attrs *attrs)
226223ab 3672{
6ba94429
FW
3673 if (attrs) {
3674 free_cpumask_var(attrs->cpumask);
3675 kfree(attrs);
3676 }
226223ab
TH
3677}
3678
6ba94429
FW
3679/**
3680 * alloc_workqueue_attrs - allocate a workqueue_attrs
6ba94429
FW
3681 *
3682 * Allocate a new workqueue_attrs, initialize with default settings and
3683 * return it.
3684 *
3685 * Return: The allocated new workqueue_attr on success. %NULL on failure.
3686 */
513c98d0 3687struct workqueue_attrs *alloc_workqueue_attrs(void)
226223ab 3688{
6ba94429 3689 struct workqueue_attrs *attrs;
226223ab 3690
be69d00d 3691 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
6ba94429
FW
3692 if (!attrs)
3693 goto fail;
be69d00d 3694 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
6ba94429
FW
3695 goto fail;
3696
3697 cpumask_copy(attrs->cpumask, cpu_possible_mask);
3698 return attrs;
3699fail:
3700 free_workqueue_attrs(attrs);
3701 return NULL;
226223ab
TH
3702}
3703
6ba94429
FW
3704static void copy_workqueue_attrs(struct workqueue_attrs *to,
3705 const struct workqueue_attrs *from)
226223ab 3706{
6ba94429
FW
3707 to->nice = from->nice;
3708 cpumask_copy(to->cpumask, from->cpumask);
3709 /*
3710 * Unlike hash and equality test, this function doesn't ignore
3711 * ->no_numa as it is used for both pool and wq attrs. Instead,
3712 * get_unbound_pool() explicitly clears ->no_numa after copying.
3713 */
3714 to->no_numa = from->no_numa;
226223ab
TH
3715}
3716
6ba94429
FW
3717/* hash value of the content of @attr */
3718static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
226223ab 3719{
6ba94429 3720 u32 hash = 0;
226223ab 3721
6ba94429
FW
3722 hash = jhash_1word(attrs->nice, hash);
3723 hash = jhash(cpumask_bits(attrs->cpumask),
3724 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3725 return hash;
226223ab 3726}
226223ab 3727
6ba94429
FW
3728/* content equality test */
3729static bool wqattrs_equal(const struct workqueue_attrs *a,
3730 const struct workqueue_attrs *b)
226223ab 3731{
6ba94429
FW
3732 if (a->nice != b->nice)
3733 return false;
3734 if (!cpumask_equal(a->cpumask, b->cpumask))
3735 return false;
3736 return true;
226223ab
TH
3737}
3738
6ba94429
FW
3739/**
3740 * init_worker_pool - initialize a newly zalloc'd worker_pool
3741 * @pool: worker_pool to initialize
3742 *
402dd89d 3743 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
6ba94429
FW
3744 *
3745 * Return: 0 on success, -errno on failure. Even on failure, all fields
3746 * inside @pool proper are initialized and put_unbound_pool() can be called
3747 * on @pool safely to release it.
3748 */
3749static int init_worker_pool(struct worker_pool *pool)
226223ab 3750{
a9b8a985 3751 raw_spin_lock_init(&pool->lock);
6ba94429
FW
3752 pool->id = -1;
3753 pool->cpu = -1;
3754 pool->node = NUMA_NO_NODE;
3755 pool->flags |= POOL_DISASSOCIATED;
82607adc 3756 pool->watchdog_ts = jiffies;
6ba94429
FW
3757 INIT_LIST_HEAD(&pool->worklist);
3758 INIT_LIST_HEAD(&pool->idle_list);
3759 hash_init(pool->busy_hash);
226223ab 3760
32a6c723 3761 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
3f959aa3 3762 INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
226223ab 3763
32a6c723 3764 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
226223ab 3765
6ba94429 3766 INIT_LIST_HEAD(&pool->workers);
e02b9312 3767 INIT_LIST_HEAD(&pool->dying_workers);
226223ab 3768
6ba94429
FW
3769 ida_init(&pool->worker_ida);
3770 INIT_HLIST_NODE(&pool->hash_node);
3771 pool->refcnt = 1;
226223ab 3772
6ba94429 3773 /* shouldn't fail above this point */
be69d00d 3774 pool->attrs = alloc_workqueue_attrs();
6ba94429
FW
3775 if (!pool->attrs)
3776 return -ENOMEM;
3777 return 0;
226223ab
TH
3778}
3779
669de8bd
BVA
3780#ifdef CONFIG_LOCKDEP
3781static void wq_init_lockdep(struct workqueue_struct *wq)
3782{
3783 char *lock_name;
3784
3785 lockdep_register_key(&wq->key);
3786 lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
3787 if (!lock_name)
3788 lock_name = wq->name;
69a106c0
QC
3789
3790 wq->lock_name = lock_name;
669de8bd
BVA
3791 lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
3792}
3793
3794static void wq_unregister_lockdep(struct workqueue_struct *wq)
3795{
3796 lockdep_unregister_key(&wq->key);
3797}
3798
3799static void wq_free_lockdep(struct workqueue_struct *wq)
3800{
3801 if (wq->lock_name != wq->name)
3802 kfree(wq->lock_name);
3803}
3804#else
3805static void wq_init_lockdep(struct workqueue_struct *wq)
3806{
3807}
3808
3809static void wq_unregister_lockdep(struct workqueue_struct *wq)
3810{
3811}
3812
3813static void wq_free_lockdep(struct workqueue_struct *wq)
3814{
3815}
3816#endif
3817
6ba94429 3818static void rcu_free_wq(struct rcu_head *rcu)
226223ab 3819{
6ba94429
FW
3820 struct workqueue_struct *wq =
3821 container_of(rcu, struct workqueue_struct, rcu);
226223ab 3822
669de8bd
BVA
3823 wq_free_lockdep(wq);
3824
6ba94429
FW
3825 if (!(wq->flags & WQ_UNBOUND))
3826 free_percpu(wq->cpu_pwqs);
226223ab 3827 else
6ba94429 3828 free_workqueue_attrs(wq->unbound_attrs);
226223ab 3829
6ba94429 3830 kfree(wq);
226223ab
TH
3831}
3832
6ba94429 3833static void rcu_free_pool(struct rcu_head *rcu)
226223ab 3834{
6ba94429 3835 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
226223ab 3836
6ba94429
FW
3837 ida_destroy(&pool->worker_ida);
3838 free_workqueue_attrs(pool->attrs);
3839 kfree(pool);
226223ab
TH
3840}
3841
6ba94429
FW
3842/**
3843 * put_unbound_pool - put a worker_pool
3844 * @pool: worker_pool to put
3845 *
24acfb71 3846 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
6ba94429
FW
3847 * safe manner. get_unbound_pool() calls this function on its failure path
3848 * and this function should be able to release pools which went through,
3849 * successfully or not, init_worker_pool().
3850 *
3851 * Should be called with wq_pool_mutex held.
3852 */
3853static void put_unbound_pool(struct worker_pool *pool)
226223ab 3854{
6ba94429 3855 DECLARE_COMPLETION_ONSTACK(detach_completion);
e02b9312 3856 struct list_head cull_list;
6ba94429 3857 struct worker *worker;
226223ab 3858
e02b9312
VS
3859 INIT_LIST_HEAD(&cull_list);
3860
6ba94429 3861 lockdep_assert_held(&wq_pool_mutex);
226223ab 3862
6ba94429
FW
3863 if (--pool->refcnt)
3864 return;
226223ab 3865
6ba94429
FW
3866 /* sanity checks */
3867 if (WARN_ON(!(pool->cpu < 0)) ||
3868 WARN_ON(!list_empty(&pool->worklist)))
3869 return;
226223ab 3870
6ba94429
FW
3871 /* release id and unhash */
3872 if (pool->id >= 0)
3873 idr_remove(&worker_pool_idr, pool->id);
3874 hash_del(&pool->hash_node);
d55262c4 3875
6ba94429 3876 /*
692b4825
TH
3877 * Become the manager and destroy all workers. This prevents
3878 * @pool's workers from blocking on attach_mutex. We're the last
3879 * manager and @pool gets freed with the flag set.
9ab03be4
VS
3880 *
3881 * Having a concurrent manager is quite unlikely to happen as we can
3882 * only get here with
3883 * pwq->refcnt == pool->refcnt == 0
3884 * which implies no work queued to the pool, which implies no worker can
3885 * become the manager. However a worker could have taken the role of
3886 * manager before the refcnts dropped to 0, since maybe_create_worker()
3887 * drops pool->lock
6ba94429 3888 */
9ab03be4
VS
3889 while (true) {
3890 rcuwait_wait_event(&manager_wait,
3891 !(pool->flags & POOL_MANAGER_ACTIVE),
3892 TASK_UNINTERRUPTIBLE);
e02b9312
VS
3893
3894 mutex_lock(&wq_pool_attach_mutex);
9ab03be4
VS
3895 raw_spin_lock_irq(&pool->lock);
3896 if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
3897 pool->flags |= POOL_MANAGER_ACTIVE;
3898 break;
3899 }
3900 raw_spin_unlock_irq(&pool->lock);
e02b9312 3901 mutex_unlock(&wq_pool_attach_mutex);
9ab03be4 3902 }
692b4825 3903
6ba94429 3904 while ((worker = first_idle_worker(pool)))
e02b9312 3905 set_worker_dying(worker, &cull_list);
6ba94429 3906 WARN_ON(pool->nr_workers || pool->nr_idle);
a9b8a985 3907 raw_spin_unlock_irq(&pool->lock);
d55262c4 3908
e02b9312
VS
3909 wake_dying_workers(&cull_list);
3910
3911 if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
6ba94429 3912 pool->detach_completion = &detach_completion;
1258fae7 3913 mutex_unlock(&wq_pool_attach_mutex);
226223ab 3914
6ba94429
FW
3915 if (pool->detach_completion)
3916 wait_for_completion(pool->detach_completion);
226223ab 3917
6ba94429
FW
3918 /* shut down the timers */
3919 del_timer_sync(&pool->idle_timer);
3f959aa3 3920 cancel_work_sync(&pool->idle_cull_work);
6ba94429 3921 del_timer_sync(&pool->mayday_timer);
226223ab 3922
24acfb71 3923 /* RCU protected to allow dereferences from get_work_pool() */
25b00775 3924 call_rcu(&pool->rcu, rcu_free_pool);
226223ab
TH
3925}
3926
3927/**
6ba94429
FW
3928 * get_unbound_pool - get a worker_pool with the specified attributes
3929 * @attrs: the attributes of the worker_pool to get
226223ab 3930 *
6ba94429
FW
3931 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3932 * reference count and return it. If there already is a matching
3933 * worker_pool, it will be used; otherwise, this function attempts to
3934 * create a new one.
226223ab 3935 *
6ba94429 3936 * Should be called with wq_pool_mutex held.
226223ab 3937 *
6ba94429
FW
3938 * Return: On success, a worker_pool with the same attributes as @attrs.
3939 * On failure, %NULL.
226223ab 3940 */
6ba94429 3941static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
226223ab 3942{
6ba94429
FW
3943 u32 hash = wqattrs_hash(attrs);
3944 struct worker_pool *pool;
3945 int node;
e2273584 3946 int target_node = NUMA_NO_NODE;
226223ab 3947
6ba94429 3948 lockdep_assert_held(&wq_pool_mutex);
226223ab 3949
6ba94429
FW
3950 /* do we already have a matching pool? */
3951 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3952 if (wqattrs_equal(pool->attrs, attrs)) {
3953 pool->refcnt++;
3954 return pool;
3955 }
3956 }
226223ab 3957
e2273584
XP
3958 /* if cpumask is contained inside a NUMA node, we belong to that node */
3959 if (wq_numa_enabled) {
3960 for_each_node(node) {
3961 if (cpumask_subset(attrs->cpumask,
3962 wq_numa_possible_cpumask[node])) {
3963 target_node = node;
3964 break;
3965 }
3966 }
3967 }
3968
6ba94429 3969 /* nope, create a new one */
e2273584 3970 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
6ba94429
FW
3971 if (!pool || init_worker_pool(pool) < 0)
3972 goto fail;
3973
3974 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3975 copy_workqueue_attrs(pool->attrs, attrs);
e2273584 3976 pool->node = target_node;
226223ab
TH
3977
3978 /*
6ba94429
FW
3979 * no_numa isn't a worker_pool attribute, always clear it. See
3980 * 'struct workqueue_attrs' comments for detail.
226223ab 3981 */
6ba94429 3982 pool->attrs->no_numa = false;
226223ab 3983
6ba94429
FW
3984 if (worker_pool_assign_id(pool) < 0)
3985 goto fail;
226223ab 3986
6ba94429 3987 /* create and start the initial worker */
3347fa09 3988 if (wq_online && !create_worker(pool))
6ba94429 3989 goto fail;
226223ab 3990
6ba94429
FW
3991 /* install */
3992 hash_add(unbound_pool_hash, &pool->hash_node, hash);
226223ab 3993
6ba94429
FW
3994 return pool;
3995fail:
3996 if (pool)
3997 put_unbound_pool(pool);
3998 return NULL;
226223ab 3999}
226223ab 4000
6ba94429 4001static void rcu_free_pwq(struct rcu_head *rcu)
7a4e344c 4002{
6ba94429
FW
4003 kmem_cache_free(pwq_cache,
4004 container_of(rcu, struct pool_workqueue, rcu));
7a4e344c
TH
4005}
4006
6ba94429
FW
4007/*
4008 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
4009 * and needs to be destroyed.
7a4e344c 4010 */
6ba94429 4011static void pwq_unbound_release_workfn(struct work_struct *work)
7a4e344c 4012{
6ba94429
FW
4013 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
4014 unbound_release_work);
4015 struct workqueue_struct *wq = pwq->wq;
4016 struct worker_pool *pool = pwq->pool;
b42b0bdd 4017 bool is_last = false;
7a4e344c 4018
b42b0bdd
YY
4019 /*
4020 * when @pwq is not linked, it doesn't hold any reference to the
4021 * @wq, and @wq is invalid to access.
4022 */
4023 if (!list_empty(&pwq->pwqs_node)) {
4024 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
4025 return;
7a4e344c 4026
b42b0bdd
YY
4027 mutex_lock(&wq->mutex);
4028 list_del_rcu(&pwq->pwqs_node);
4029 is_last = list_empty(&wq->pwqs);
4030 mutex_unlock(&wq->mutex);
4031 }
6ba94429
FW
4032
4033 mutex_lock(&wq_pool_mutex);
4034 put_unbound_pool(pool);
4035 mutex_unlock(&wq_pool_mutex);
4036
25b00775 4037 call_rcu(&pwq->rcu, rcu_free_pwq);
7a4e344c 4038
2865a8fb 4039 /*
6ba94429
FW
4040 * If we're the last pwq going away, @wq is already dead and no one
4041 * is gonna access it anymore. Schedule RCU free.
2865a8fb 4042 */
669de8bd
BVA
4043 if (is_last) {
4044 wq_unregister_lockdep(wq);
25b00775 4045 call_rcu(&wq->rcu, rcu_free_wq);
669de8bd 4046 }
29c91e99
TH
4047}
4048
7a4e344c 4049/**
6ba94429
FW
4050 * pwq_adjust_max_active - update a pwq's max_active to the current setting
4051 * @pwq: target pool_workqueue
d185af30 4052 *
6ba94429 4053 * If @pwq isn't freezing, set @pwq->max_active to the associated
f97a4a1a 4054 * workqueue's saved_max_active and activate inactive work items
6ba94429 4055 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
7a4e344c 4056 */
6ba94429 4057static void pwq_adjust_max_active(struct pool_workqueue *pwq)
4e1a1f9a 4058{
6ba94429
FW
4059 struct workqueue_struct *wq = pwq->wq;
4060 bool freezable = wq->flags & WQ_FREEZABLE;
3347fa09 4061 unsigned long flags;
4e1a1f9a 4062
6ba94429
FW
4063 /* for @wq->saved_max_active */
4064 lockdep_assert_held(&wq->mutex);
4e1a1f9a 4065
6ba94429
FW
4066 /* fast exit for non-freezable wqs */
4067 if (!freezable && pwq->max_active == wq->saved_max_active)
4068 return;
7a4e344c 4069
3347fa09 4070 /* this function can be called during early boot w/ irq disabled */
a9b8a985 4071 raw_spin_lock_irqsave(&pwq->pool->lock, flags);
29c91e99 4072
6ba94429
FW
4073 /*
4074 * During [un]freezing, the caller is responsible for ensuring that
4075 * this function is called at least once after @workqueue_freezing
4076 * is updated and visible.
4077 */
4078 if (!freezable || !workqueue_freezing) {
01341fbd
YY
4079 bool kick = false;
4080
6ba94429 4081 pwq->max_active = wq->saved_max_active;
4e1a1f9a 4082
f97a4a1a 4083 while (!list_empty(&pwq->inactive_works) &&
01341fbd 4084 pwq->nr_active < pwq->max_active) {
f97a4a1a 4085 pwq_activate_first_inactive(pwq);
01341fbd
YY
4086 kick = true;
4087 }
e2dca7ad 4088
6ba94429
FW
4089 /*
4090 * Need to kick a worker after thawed or an unbound wq's
01341fbd
YY
4091 * max_active is bumped. In realtime scenarios, always kicking a
4092 * worker will cause interference on the isolated cpu cores, so
4093 * let's kick iff work items were activated.
6ba94429 4094 */
01341fbd
YY
4095 if (kick)
4096 wake_up_worker(pwq->pool);
6ba94429
FW
4097 } else {
4098 pwq->max_active = 0;
4099 }
e2dca7ad 4100
a9b8a985 4101 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
e2dca7ad
TH
4102}
4103
67dc8325 4104/* initialize newly allocated @pwq which is associated with @wq and @pool */
6ba94429
FW
4105static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
4106 struct worker_pool *pool)
29c91e99 4107{
6ba94429 4108 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
29c91e99 4109
6ba94429
FW
4110 memset(pwq, 0, sizeof(*pwq));
4111
4112 pwq->pool = pool;
4113 pwq->wq = wq;
4114 pwq->flush_color = -1;
4115 pwq->refcnt = 1;
f97a4a1a 4116 INIT_LIST_HEAD(&pwq->inactive_works);
6ba94429
FW
4117 INIT_LIST_HEAD(&pwq->pwqs_node);
4118 INIT_LIST_HEAD(&pwq->mayday_node);
4119 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
29c91e99
TH
4120}
4121
6ba94429
FW
4122/* sync @pwq with the current state of its associated wq and link it */
4123static void link_pwq(struct pool_workqueue *pwq)
29c91e99 4124{
6ba94429 4125 struct workqueue_struct *wq = pwq->wq;
29c91e99 4126
6ba94429 4127 lockdep_assert_held(&wq->mutex);
a892cacc 4128
6ba94429
FW
4129 /* may be called multiple times, ignore if already linked */
4130 if (!list_empty(&pwq->pwqs_node))
29c91e99 4131 return;
29c91e99 4132
6ba94429
FW
4133 /* set the matching work_color */
4134 pwq->work_color = wq->work_color;
29c91e99 4135
6ba94429
FW
4136 /* sync max_active to the current setting */
4137 pwq_adjust_max_active(pwq);
29c91e99 4138
6ba94429
FW
4139 /* link in @pwq */
4140 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
4141}
29c91e99 4142
6ba94429
FW
4143/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
4144static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
4145 const struct workqueue_attrs *attrs)
4146{
4147 struct worker_pool *pool;
4148 struct pool_workqueue *pwq;
60f5a4bc 4149
6ba94429 4150 lockdep_assert_held(&wq_pool_mutex);
60f5a4bc 4151
6ba94429
FW
4152 pool = get_unbound_pool(attrs);
4153 if (!pool)
4154 return NULL;
60f5a4bc 4155
6ba94429
FW
4156 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
4157 if (!pwq) {
4158 put_unbound_pool(pool);
4159 return NULL;
4160 }
29c91e99 4161
6ba94429
FW
4162 init_pwq(pwq, wq, pool);
4163 return pwq;
4164}
29c91e99 4165
29c91e99 4166/**
30186c6f 4167 * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
042f7df1 4168 * @attrs: the wq_attrs of the default pwq of the target workqueue
6ba94429
FW
4169 * @node: the target NUMA node
4170 * @cpu_going_down: if >= 0, the CPU to consider as offline
4171 * @cpumask: outarg, the resulting cpumask
29c91e99 4172 *
6ba94429
FW
4173 * Calculate the cpumask a workqueue with @attrs should use on @node. If
4174 * @cpu_going_down is >= 0, that cpu is considered offline during
4175 * calculation. The result is stored in @cpumask.
a892cacc 4176 *
6ba94429
FW
4177 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
4178 * enabled and @node has online CPUs requested by @attrs, the returned
4179 * cpumask is the intersection of the possible CPUs of @node and
4180 * @attrs->cpumask.
d185af30 4181 *
6ba94429
FW
4182 * The caller is responsible for ensuring that the cpumask of @node stays
4183 * stable.
4184 *
4185 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
4186 * %false if equal.
29c91e99 4187 */
6ba94429
FW
4188static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
4189 int cpu_going_down, cpumask_t *cpumask)
29c91e99 4190{
6ba94429
FW
4191 if (!wq_numa_enabled || attrs->no_numa)
4192 goto use_dfl;
29c91e99 4193
6ba94429
FW
4194 /* does @node have any online CPUs @attrs wants? */
4195 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
4196 if (cpu_going_down >= 0)
4197 cpumask_clear_cpu(cpu_going_down, cpumask);
29c91e99 4198
6ba94429
FW
4199 if (cpumask_empty(cpumask))
4200 goto use_dfl;
4c16bd32
TH
4201
4202 /* yeap, return possible CPUs in @node that @attrs wants */
4203 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
1ad0f0a7
MB
4204
4205 if (cpumask_empty(cpumask)) {
4206 pr_warn_once("WARNING: workqueue cpumask: online intersect > "
4207 "possible intersect\n");
4208 return false;
4209 }
4210
4c16bd32
TH
4211 return !cpumask_equal(cpumask, attrs->cpumask);
4212
4213use_dfl:
4214 cpumask_copy(cpumask, attrs->cpumask);
4215 return false;
4216}
4217
1befcf30
TH
4218/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
4219static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
4220 int node,
4221 struct pool_workqueue *pwq)
4222{
4223 struct pool_workqueue *old_pwq;
4224
5b95e1af 4225 lockdep_assert_held(&wq_pool_mutex);
1befcf30
TH
4226 lockdep_assert_held(&wq->mutex);
4227
4228 /* link_pwq() can handle duplicate calls */
4229 link_pwq(pwq);
4230
4231 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
4232 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
4233 return old_pwq;
4234}
4235
2d5f0764
LJ
4236/* context to store the prepared attrs & pwqs before applying */
4237struct apply_wqattrs_ctx {
4238 struct workqueue_struct *wq; /* target workqueue */
4239 struct workqueue_attrs *attrs; /* attrs to apply */
042f7df1 4240 struct list_head list; /* queued for batching commit */
2d5f0764
LJ
4241 struct pool_workqueue *dfl_pwq;
4242 struct pool_workqueue *pwq_tbl[];
4243};
4244
4245/* free the resources after success or abort */
4246static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
4247{
4248 if (ctx) {
4249 int node;
4250
4251 for_each_node(node)
4252 put_pwq_unlocked(ctx->pwq_tbl[node]);
4253 put_pwq_unlocked(ctx->dfl_pwq);
4254
4255 free_workqueue_attrs(ctx->attrs);
4256
4257 kfree(ctx);
4258 }
4259}
4260
4261/* allocate the attrs and pwqs for later installation */
4262static struct apply_wqattrs_ctx *
4263apply_wqattrs_prepare(struct workqueue_struct *wq,
99c621ef
LJ
4264 const struct workqueue_attrs *attrs,
4265 const cpumask_var_t unbound_cpumask)
9e8cd2f5 4266{
2d5f0764 4267 struct apply_wqattrs_ctx *ctx;
4c16bd32 4268 struct workqueue_attrs *new_attrs, *tmp_attrs;
2d5f0764 4269 int node;
9e8cd2f5 4270
2d5f0764 4271 lockdep_assert_held(&wq_pool_mutex);
9e8cd2f5 4272
acafe7e3 4273 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
8719dcea 4274
be69d00d
TG
4275 new_attrs = alloc_workqueue_attrs();
4276 tmp_attrs = alloc_workqueue_attrs();
2d5f0764
LJ
4277 if (!ctx || !new_attrs || !tmp_attrs)
4278 goto out_free;
13e2e556 4279
042f7df1 4280 /*
99c621ef
LJ
4281 * Calculate the attrs of the default pwq with unbound_cpumask
4282 * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.
042f7df1
LJ
4283 * If the user configured cpumask doesn't overlap with the
4284 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
4285 */
13e2e556 4286 copy_workqueue_attrs(new_attrs, attrs);
99c621ef 4287 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
042f7df1 4288 if (unlikely(cpumask_empty(new_attrs->cpumask)))
99c621ef 4289 cpumask_copy(new_attrs->cpumask, unbound_cpumask);
13e2e556 4290
4c16bd32
TH
4291 /*
4292 * We may create multiple pwqs with differing cpumasks. Make a
4293 * copy of @new_attrs which will be modified and used to obtain
4294 * pools.
4295 */
4296 copy_workqueue_attrs(tmp_attrs, new_attrs);
4297
4c16bd32
TH
4298 /*
4299 * If something goes wrong during CPU up/down, we'll fall back to
4300 * the default pwq covering whole @attrs->cpumask. Always create
4301 * it even if we don't use it immediately.
4302 */
2d5f0764
LJ
4303 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
4304 if (!ctx->dfl_pwq)
4305 goto out_free;
4c16bd32
TH
4306
4307 for_each_node(node) {
042f7df1 4308 if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
2d5f0764
LJ
4309 ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
4310 if (!ctx->pwq_tbl[node])
4311 goto out_free;
4c16bd32 4312 } else {
2d5f0764
LJ
4313 ctx->dfl_pwq->refcnt++;
4314 ctx->pwq_tbl[node] = ctx->dfl_pwq;
4c16bd32
TH
4315 }
4316 }
4317
042f7df1
LJ
4318 /* save the user configured attrs and sanitize it. */
4319 copy_workqueue_attrs(new_attrs, attrs);
4320 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
2d5f0764 4321 ctx->attrs = new_attrs;
042f7df1 4322
2d5f0764
LJ
4323 ctx->wq = wq;
4324 free_workqueue_attrs(tmp_attrs);
4325 return ctx;
4326
4327out_free:
4328 free_workqueue_attrs(tmp_attrs);
4329 free_workqueue_attrs(new_attrs);
4330 apply_wqattrs_cleanup(ctx);
4331 return NULL;
4332}
4333
4334/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
4335static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
4336{
4337 int node;
9e8cd2f5 4338
4c16bd32 4339 /* all pwqs have been created successfully, let's install'em */
2d5f0764 4340 mutex_lock(&ctx->wq->mutex);
a892cacc 4341
2d5f0764 4342 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
4c16bd32
TH
4343
4344 /* save the previous pwq and install the new one */
f147f29e 4345 for_each_node(node)
2d5f0764
LJ
4346 ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
4347 ctx->pwq_tbl[node]);
4c16bd32
TH
4348
4349 /* @dfl_pwq might not have been used, ensure it's linked */
2d5f0764
LJ
4350 link_pwq(ctx->dfl_pwq);
4351 swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
f147f29e 4352
2d5f0764
LJ
4353 mutex_unlock(&ctx->wq->mutex);
4354}
9e8cd2f5 4355
a0111cf6
LJ
4356static void apply_wqattrs_lock(void)
4357{
4358 /* CPUs should stay stable across pwq creations and installations */
ffd8bea8 4359 cpus_read_lock();
a0111cf6
LJ
4360 mutex_lock(&wq_pool_mutex);
4361}
4362
4363static void apply_wqattrs_unlock(void)
4364{
4365 mutex_unlock(&wq_pool_mutex);
ffd8bea8 4366 cpus_read_unlock();
a0111cf6
LJ
4367}
4368
4369static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
4370 const struct workqueue_attrs *attrs)
2d5f0764
LJ
4371{
4372 struct apply_wqattrs_ctx *ctx;
4c16bd32 4373
2d5f0764
LJ
4374 /* only unbound workqueues can change attributes */
4375 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
4376 return -EINVAL;
13e2e556 4377
2d5f0764 4378 /* creating multiple pwqs breaks ordering guarantee */
0a94efb5
TH
4379 if (!list_empty(&wq->pwqs)) {
4380 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
4381 return -EINVAL;
4382
4383 wq->flags &= ~__WQ_ORDERED;
4384 }
2d5f0764 4385
99c621ef 4386 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
6201171e 4387 if (!ctx)
4388 return -ENOMEM;
2d5f0764
LJ
4389
4390 /* the ctx has been prepared successfully, let's commit it */
6201171e 4391 apply_wqattrs_commit(ctx);
2d5f0764
LJ
4392 apply_wqattrs_cleanup(ctx);
4393
6201171e 4394 return 0;
9e8cd2f5
TH
4395}
4396
a0111cf6
LJ
4397/**
4398 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
4399 * @wq: the target workqueue
4400 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
4401 *
4402 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
4403 * machines, this function maps a separate pwq to each NUMA node with
 4404 * possible CPUs in @attrs->cpumask so that work items are affine to the
 4405 * NUMA node they are issued on. Older pwqs are released as in-flight work
4406 * items finish. Note that a work item which repeatedly requeues itself
4407 * back-to-back will stay on its current pwq.
4408 *
4409 * Performs GFP_KERNEL allocations.
4410 *
ffd8bea8 4411 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
509b3204 4412 *
a0111cf6
LJ
4413 * Return: 0 on success and -errno on failure.
4414 */
513c98d0 4415int apply_workqueue_attrs(struct workqueue_struct *wq,
a0111cf6
LJ
4416 const struct workqueue_attrs *attrs)
4417{
4418 int ret;
4419
509b3204
DJ
4420 lockdep_assert_cpus_held();
4421
4422 mutex_lock(&wq_pool_mutex);
a0111cf6 4423 ret = apply_workqueue_attrs_locked(wq, attrs);
509b3204 4424 mutex_unlock(&wq_pool_mutex);
a0111cf6
LJ
4425
4426 return ret;
4427}
4428
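/*
 * Illustrative sketch (not from this file): an in-kernel user retuning an
 * unbound workqueue with apply_workqueue_attrs(). The attrs are allocated
 * with alloc_workqueue_attrs(), adjusted, applied and then freed again (the
 * workqueue keeps its own copy). waldo_tune_wq(), the nice level and the
 * housekeeping cpumask are arbitrary assumptions.
 */
static int waldo_tune_wq(struct workqueue_struct *waldo_wq,
			 const struct cpumask *housekeeping)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs();
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;				/* run slightly above default priority */
	cpumask_copy(attrs->cpumask, housekeeping);	/* keep off isolated CPUs */

	cpus_read_lock();				/* caller must hold the hotplug read lock */
	ret = apply_workqueue_attrs(waldo_wq, attrs);
	cpus_read_unlock();

	free_workqueue_attrs(attrs);
	return ret;
}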
4c16bd32
TH
4429/**
4430 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
4431 * @wq: the target workqueue
4432 * @cpu: the CPU coming up or going down
4433 * @online: whether @cpu is coming up or going down
4434 *
4435 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
4436 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
4437 * @wq accordingly.
4438 *
4439 * If NUMA affinity can't be adjusted due to memory allocation failure, it
4440 * falls back to @wq->dfl_pwq which may not be optimal but is always
4441 * correct.
4442 *
4443 * Note that when the last allowed CPU of a NUMA node goes offline for a
4444 * workqueue with a cpumask spanning multiple nodes, the workers which were
4445 * already executing the work items for the workqueue will lose their CPU
4446 * affinity and may execute on any CPU. This is similar to how per-cpu
4447 * workqueues behave on CPU_DOWN. If a workqueue user wants strict
4448 * affinity, it's the user's responsibility to flush the work item from
4449 * CPU_DOWN_PREPARE.
4450 */
4451static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4452 bool online)
4453{
4454 int node = cpu_to_node(cpu);
4455 int cpu_off = online ? -1 : cpu;
4456 struct pool_workqueue *old_pwq = NULL, *pwq;
4457 struct workqueue_attrs *target_attrs;
4458 cpumask_t *cpumask;
4459
4460 lockdep_assert_held(&wq_pool_mutex);
4461
f7142ed4
LJ
4462 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
4463 wq->unbound_attrs->no_numa)
4c16bd32
TH
4464 return;
4465
4466 /*
4467 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
4468 * Let's use a preallocated one. The following buf is protected by
4469 * CPU hotplug exclusion.
4470 */
4471 target_attrs = wq_update_unbound_numa_attrs_buf;
4472 cpumask = target_attrs->cpumask;
4473
4c16bd32
TH
4474 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
4475 pwq = unbound_pwq_by_node(wq, node);
4476
4477 /*
4478 * Let's determine what needs to be done. If the target cpumask is
042f7df1
LJ
4479 * different from the default pwq's, we need to compare it to @pwq's
4480 * and create a new one if they don't match. If the target cpumask
4481 * equals the default pwq's, the default pwq should be used.
4c16bd32 4482 */
042f7df1 4483 if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
4c16bd32 4484 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
f7142ed4 4485 return;
4c16bd32 4486 } else {
534a3fbb 4487 goto use_dfl_pwq;
4c16bd32
TH
4488 }
4489
4c16bd32
TH
4490 /* create a new pwq */
4491 pwq = alloc_unbound_pwq(wq, target_attrs);
4492 if (!pwq) {
2d916033
FF
4493 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4494 wq->name);
77f300b1 4495 goto use_dfl_pwq;
4c16bd32
TH
4496 }
4497
f7142ed4 4498 /* Install the new pwq. */
4c16bd32
TH
4499 mutex_lock(&wq->mutex);
4500 old_pwq = numa_pwq_tbl_install(wq, node, pwq);
4501 goto out_unlock;
4502
4503use_dfl_pwq:
f7142ed4 4504 mutex_lock(&wq->mutex);
a9b8a985 4505 raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
4c16bd32 4506 get_pwq(wq->dfl_pwq);
a9b8a985 4507 raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
4c16bd32
TH
4508 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
4509out_unlock:
4510 mutex_unlock(&wq->mutex);
4511 put_pwq_unlocked(old_pwq);
4512}
4513
30cdf249 4514static int alloc_and_link_pwqs(struct workqueue_struct *wq)
0f900049 4515{
49e3cf44 4516 bool highpri = wq->flags & WQ_HIGHPRI;
8a2b7538 4517 int cpu, ret;
30cdf249
TH
4518
4519 if (!(wq->flags & WQ_UNBOUND)) {
420c0ddb
TH
4520 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
4521 if (!wq->cpu_pwqs)
30cdf249
TH
4522 return -ENOMEM;
4523
4524 for_each_possible_cpu(cpu) {
7fb98ea7
TH
4525 struct pool_workqueue *pwq =
4526 per_cpu_ptr(wq->cpu_pwqs, cpu);
7a62c2c8 4527 struct worker_pool *cpu_pools =
f02ae73a 4528 per_cpu(cpu_worker_pools, cpu);
f3421797 4529
f147f29e
TH
4530 init_pwq(pwq, wq, &cpu_pools[highpri]);
4531
4532 mutex_lock(&wq->mutex);
1befcf30 4533 link_pwq(pwq);
f147f29e 4534 mutex_unlock(&wq->mutex);
30cdf249 4535 }
9e8cd2f5 4536 return 0;
509b3204
DJ
4537 }
4538
ffd8bea8 4539 cpus_read_lock();
509b3204 4540 if (wq->flags & __WQ_ORDERED) {
8a2b7538
TH
4541 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
4542 /* there should only be single pwq for ordering guarantee */
4543 WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
4544 wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
4545 "ordering guarantee broken for workqueue %s\n", wq->name);
30cdf249 4546 } else {
509b3204 4547 ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
30cdf249 4548 }
ffd8bea8 4549 cpus_read_unlock();
509b3204
DJ
4550
4551 return ret;
0f900049
TH
4552}
4553
f3421797
TH
4554static int wq_clamp_max_active(int max_active, unsigned int flags,
4555 const char *name)
b71ab8c2 4556{
f3421797
TH
4557 int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;
4558
4559 if (max_active < 1 || max_active > lim)
044c782c
VI
4560 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
4561 max_active, name, 1, lim);
b71ab8c2 4562
f3421797 4563 return clamp_val(max_active, 1, lim);
b71ab8c2
TH
4564}
4565
983c7515
TH
4566/*
4567 * Workqueues which may be used during memory reclaim should have a rescuer
4568 * to guarantee forward progress.
4569 */
4570static int init_rescuer(struct workqueue_struct *wq)
4571{
4572 struct worker *rescuer;
b92b36ea 4573 int ret;
983c7515
TH
4574
4575 if (!(wq->flags & WQ_MEM_RECLAIM))
4576 return 0;
4577
4578 rescuer = alloc_worker(NUMA_NO_NODE);
4c0736a7
PM
4579 if (!rescuer) {
4580 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
4581 wq->name);
983c7515 4582 return -ENOMEM;
4c0736a7 4583 }
983c7515
TH
4584
4585 rescuer->rescue_wq = wq;
4586 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
f187b697 4587 if (IS_ERR(rescuer->task)) {
b92b36ea 4588 ret = PTR_ERR(rescuer->task);
4c0736a7
PM
4589 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
4590 wq->name, ERR_PTR(ret));
983c7515 4591 kfree(rescuer);
b92b36ea 4592 return ret;
983c7515
TH
4593 }
4594
4595 wq->rescuer = rescuer;
4596 kthread_bind_mask(rescuer->task, cpu_possible_mask);
4597 wake_up_process(rescuer->task);
4598
4599 return 0;
4600}
4601
a2775bbc 4602__printf(1, 4)
669de8bd
BVA
4603struct workqueue_struct *alloc_workqueue(const char *fmt,
4604 unsigned int flags,
4605 int max_active, ...)
1da177e4 4606{
df2d5ae4 4607 size_t tbl_size = 0;
ecf6881f 4608 va_list args;
1da177e4 4609 struct workqueue_struct *wq;
49e3cf44 4610 struct pool_workqueue *pwq;
b196be89 4611
5c0338c6
TH
4612 /*
4613 * Unbound && max_active == 1 used to imply ordered, which is no
4614 * longer the case on NUMA machines due to per-node pools. While
4615 * alloc_ordered_workqueue() is the right way to create an ordered
4616 * workqueue, keep the previous behavior to avoid subtle breakages
4617 * on NUMA.
4618 */
4619 if ((flags & WQ_UNBOUND) && max_active == 1)
4620 flags |= __WQ_ORDERED;
4621
cee22a15
VK
4622 /* see the comment above the definition of WQ_POWER_EFFICIENT */
4623 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
4624 flags |= WQ_UNBOUND;
4625
ecf6881f 4626 /* allocate wq and format name */
df2d5ae4 4627 if (flags & WQ_UNBOUND)
ddcb57e2 4628 tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
df2d5ae4
TH
4629
4630 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
b196be89 4631 if (!wq)
d2c1d404 4632 return NULL;
b196be89 4633
6029a918 4634 if (flags & WQ_UNBOUND) {
be69d00d 4635 wq->unbound_attrs = alloc_workqueue_attrs();
6029a918
TH
4636 if (!wq->unbound_attrs)
4637 goto err_free_wq;
4638 }
4639
669de8bd 4640 va_start(args, max_active);
ecf6881f 4641 vsnprintf(wq->name, sizeof(wq->name), fmt, args);
b196be89 4642 va_end(args);
1da177e4 4643
d320c038 4644 max_active = max_active ?: WQ_DFL_ACTIVE;
b196be89 4645 max_active = wq_clamp_max_active(max_active, flags, wq->name);
3af24433 4646
b196be89 4647 /* init wq */
97e37d7b 4648 wq->flags = flags;
a0a1a5fd 4649 wq->saved_max_active = max_active;
3c25a55d 4650 mutex_init(&wq->mutex);
112202d9 4651 atomic_set(&wq->nr_pwqs_to_flush, 0);
30cdf249 4652 INIT_LIST_HEAD(&wq->pwqs);
73f53c4a
TH
4653 INIT_LIST_HEAD(&wq->flusher_queue);
4654 INIT_LIST_HEAD(&wq->flusher_overflow);
493a1724 4655 INIT_LIST_HEAD(&wq->maydays);
502ca9d8 4656
669de8bd 4657 wq_init_lockdep(wq);
cce1a165 4658 INIT_LIST_HEAD(&wq->list);
3af24433 4659
30cdf249 4660 if (alloc_and_link_pwqs(wq) < 0)
82efcab3 4661 goto err_unreg_lockdep;
1537663f 4662
40c17f75 4663 if (wq_online && init_rescuer(wq) < 0)
983c7515 4664 goto err_destroy;
3af24433 4665
226223ab
TH
4666 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4667 goto err_destroy;
4668
a0a1a5fd 4669 /*
68e13a67
LJ
4670 * wq_pool_mutex protects global freeze state and workqueues list.
4671 * Grab it, adjust max_active and add the new @wq to workqueues
4672 * list.
a0a1a5fd 4673 */
68e13a67 4674 mutex_lock(&wq_pool_mutex);
a0a1a5fd 4675
a357fc03 4676 mutex_lock(&wq->mutex);
699ce097
TH
4677 for_each_pwq(pwq, wq)
4678 pwq_adjust_max_active(pwq);
a357fc03 4679 mutex_unlock(&wq->mutex);
a0a1a5fd 4680
e2dca7ad 4681 list_add_tail_rcu(&wq->list, &workqueues);
a0a1a5fd 4682
68e13a67 4683 mutex_unlock(&wq_pool_mutex);
1537663f 4684
3af24433 4685 return wq;
d2c1d404 4686
82efcab3 4687err_unreg_lockdep:
009bb421
BVA
4688 wq_unregister_lockdep(wq);
4689 wq_free_lockdep(wq);
82efcab3 4690err_free_wq:
6029a918 4691 free_workqueue_attrs(wq->unbound_attrs);
d2c1d404
TH
4692 kfree(wq);
4693 return NULL;
4694err_destroy:
4695 destroy_workqueue(wq);
4690c4ab 4696 return NULL;
3af24433 4697}
669de8bd 4698EXPORT_SYMBOL_GPL(alloc_workqueue);
1da177e4 4699
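/*
 * Illustrative usage sketch (editor addition, not part of the original
 * source): typical alloc_workqueue() calls exercising the flag handling
 * above.  The names "my_driver", my_wq and my_work are hypothetical.
 *
 *	my_wq = alloc_workqueue("my_driver", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
 *	if (!my_wq)
 *		return -ENOMEM;
 *	queue_work(my_wq, &my_work);
 *
 * Passing 0 for max_active selects WQ_DFL_ACTIVE, and WQ_MEM_RECLAIM makes
 * init_rescuer() above attach a rescuer thread.  For strict one-at-a-time
 * execution, prefer alloc_ordered_workqueue() over relying on the
 * WQ_UNBOUND && max_active == 1 special case handled above.
 */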
c29eb853
TH
4700static bool pwq_busy(struct pool_workqueue *pwq)
4701{
4702 int i;
4703
4704 for (i = 0; i < WORK_NR_COLORS; i++)
4705 if (pwq->nr_in_flight[i])
4706 return true;
4707
4708 if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1))
4709 return true;
f97a4a1a 4710 if (pwq->nr_active || !list_empty(&pwq->inactive_works))
c29eb853
TH
4711 return true;
4712
4713 return false;
4714}
4715
3af24433
ON
4716/**
4717 * destroy_workqueue - safely terminate a workqueue
4718 * @wq: target workqueue
4719 *
4720 * Safely destroy a workqueue. All work currently pending will be done first.
4721 */
4722void destroy_workqueue(struct workqueue_struct *wq)
4723{
49e3cf44 4724 struct pool_workqueue *pwq;
4c16bd32 4725 int node;
3af24433 4726
def98c84
TH
4727 /*
4728 * Remove it from sysfs first so that sanity check failure doesn't
4729 * lead to sysfs name conflicts.
4730 */
4731 workqueue_sysfs_unregister(wq);
4732
33e3f0a3
RC
4733 /* mark the workqueue destruction is in progress */
4734 mutex_lock(&wq->mutex);
4735 wq->flags |= __WQ_DESTROYING;
4736 mutex_unlock(&wq->mutex);
4737
9c5a2ba7
TH
4738 /* drain it before proceeding with destruction */
4739 drain_workqueue(wq);
c8efcc25 4740
def98c84
TH
4741 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
4742 if (wq->rescuer) {
4743 struct worker *rescuer = wq->rescuer;
4744
4745 /* this prevents new queueing */
a9b8a985 4746 raw_spin_lock_irq(&wq_mayday_lock);
def98c84 4747 wq->rescuer = NULL;
a9b8a985 4748 raw_spin_unlock_irq(&wq_mayday_lock);
def98c84
TH
4749
4750 /* rescuer will empty maydays list before exiting */
4751 kthread_stop(rescuer->task);
8efe1223 4752 kfree(rescuer);
def98c84
TH
4753 }
4754
c29eb853
TH
4755 /*
4756 * Sanity checks - grab all the locks so that we wait for all
4757 * in-flight operations which may do put_pwq().
4758 */
4759 mutex_lock(&wq_pool_mutex);
b09f4fd3 4760 mutex_lock(&wq->mutex);
49e3cf44 4761 for_each_pwq(pwq, wq) {
a9b8a985 4762 raw_spin_lock_irq(&pwq->pool->lock);
c29eb853 4763 if (WARN_ON(pwq_busy(pwq))) {
1d9a6159
KW
4764 pr_warn("%s: %s has the following busy pwq\n",
4765 __func__, wq->name);
c29eb853 4766 show_pwq(pwq);
a9b8a985 4767 raw_spin_unlock_irq(&pwq->pool->lock);
b09f4fd3 4768 mutex_unlock(&wq->mutex);
c29eb853 4769 mutex_unlock(&wq_pool_mutex);
55df0933 4770 show_one_workqueue(wq);
6183c009 4771 return;
76af4d93 4772 }
a9b8a985 4773 raw_spin_unlock_irq(&pwq->pool->lock);
6183c009 4774 }
b09f4fd3 4775 mutex_unlock(&wq->mutex);
6183c009 4776
a0a1a5fd
TH
4777 /*
4778 * wq list is used to freeze wq, remove from list after
4779 * flushing is complete in case freeze races us.
4780 */
e2dca7ad 4781 list_del_rcu(&wq->list);
68e13a67 4782 mutex_unlock(&wq_pool_mutex);
3af24433 4783
8864b4e5 4784 if (!(wq->flags & WQ_UNBOUND)) {
669de8bd 4785 wq_unregister_lockdep(wq);
8864b4e5
TH
4786 /*
4787 * The base ref is never dropped on per-cpu pwqs. Directly
e2dca7ad 4788 * schedule RCU free.
8864b4e5 4789 */
25b00775 4790 call_rcu(&wq->rcu, rcu_free_wq);
8864b4e5
TH
4791 } else {
4792 /*
4793 * We're the sole accessor of @wq at this point. Directly
4c16bd32
TH
4794 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
4795 * @wq will be freed when the last pwq is released.
8864b4e5 4796 */
4c16bd32
TH
4797 for_each_node(node) {
4798 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
4799 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
4800 put_pwq_unlocked(pwq);
4801 }
4802
4803 /*
4804 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
4805 * put. Don't access it afterwards.
4806 */
4807 pwq = wq->dfl_pwq;
4808 wq->dfl_pwq = NULL;
dce90d47 4809 put_pwq_unlocked(pwq);
29c91e99 4810 }
3af24433
ON
4811}
4812EXPORT_SYMBOL_GPL(destroy_workqueue);
4813
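/*
 * Illustrative teardown sketch (editor addition, not part of the original
 * source): callers normally stop queueing and cancel their own work items
 * first; destroy_workqueue() then drains whatever is still pending.  my_wq
 * and my_work are hypothetical.
 *
 *	cancel_work_sync(&my_work);
 *	destroy_workqueue(my_wq);
 *	my_wq = NULL;
 */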
dcd989cb
TH
4814/**
4815 * workqueue_set_max_active - adjust max_active of a workqueue
4816 * @wq: target workqueue
4817 * @max_active: new max_active value.
4818 *
4819 * Set max_active of @wq to @max_active.
4820 *
4821 * CONTEXT:
4822 * Don't call from IRQ context.
4823 */
4824void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
4825{
49e3cf44 4826 struct pool_workqueue *pwq;
dcd989cb 4827
8719dcea 4828 /* disallow meddling with max_active for ordered workqueues */
0a94efb5 4829 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
8719dcea
TH
4830 return;
4831
f3421797 4832 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
dcd989cb 4833
a357fc03 4834 mutex_lock(&wq->mutex);
dcd989cb 4835
0a94efb5 4836 wq->flags &= ~__WQ_ORDERED;
dcd989cb
TH
4837 wq->saved_max_active = max_active;
4838
699ce097
TH
4839 for_each_pwq(pwq, wq)
4840 pwq_adjust_max_active(pwq);
93981800 4841
a357fc03 4842 mutex_unlock(&wq->mutex);
15316ba8 4843}
dcd989cb 4844EXPORT_SYMBOL_GPL(workqueue_set_max_active);
15316ba8 4845
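/*
 * Illustrative sketch (editor addition, not part of the original source):
 * raising the concurrency limit of a hypothetical unbound workqueue at
 * runtime.  Not applicable to workqueues created with
 * alloc_ordered_workqueue(), which the WARN_ON above rejects.
 *
 *	my_wq = alloc_workqueue("my_driver", WQ_UNBOUND, 4);
 *	...
 *	workqueue_set_max_active(my_wq, 16);
 */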
27d4ee03
LW
4846/**
4847 * current_work - retrieve %current task's work struct
4848 *
4849 * Determine if %current task is a workqueue worker and what it's working on.
4850 * Useful to find out the context that the %current task is running in.
4851 *
4852 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
4853 */
4854struct work_struct *current_work(void)
4855{
4856 struct worker *worker = current_wq_worker();
4857
4858 return worker ? worker->current_work : NULL;
4859}
4860EXPORT_SYMBOL(current_work);
4861
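/*
 * Illustrative sketch (editor addition, not part of the original source): a
 * hypothetical helper that may be called both from inside and outside
 * my_work's callback can use current_work() to avoid flushing the work item
 * from within itself, which would deadlock.
 *
 *	if (current_work() != &my_work)
 *		flush_work(&my_work);
 */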
e6267616
TH
4862/**
4863 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4864 *
4865 * Determine whether %current is a workqueue rescuer. Can be used from
4866 * work functions to determine whether it's being run off the rescuer task.
d185af30
YB
4867 *
4868 * Return: %true if %current is a workqueue rescuer. %false otherwise.
e6267616
TH
4869 */
4870bool current_is_workqueue_rescuer(void)
4871{
4872 struct worker *worker = current_wq_worker();
4873
6a092dfd 4874 return worker && worker->rescue_wq;
e6267616
TH
4875}
4876
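/*
 * Illustrative sketch (editor addition, not part of the original source): a
 * work function on a hypothetical WQ_MEM_RECLAIM workqueue deferring
 * optional, expensive processing when it is being run by the rescuer under
 * memory pressure.  my_work_fn() and the two helpers are made up.
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		my_do_essential_part(work);
 *		if (!current_is_workqueue_rescuer())
 *			my_do_optional_expensive_part(work);
 *	}
 */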
eef6a7d5 4877/**
dcd989cb
TH
4878 * workqueue_congested - test whether a workqueue is congested
4879 * @cpu: CPU in question
4880 * @wq: target workqueue
eef6a7d5 4881 *
dcd989cb
TH
4882 * Test whether @wq's cpu workqueue for @cpu is congested. There is
4883 * no synchronization around this function and the test result is
4884 * unreliable and only useful as advisory hints or for debugging.
eef6a7d5 4885 *
d3251859
TH
4886 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
4887 * Note that both per-cpu and unbound workqueues may be associated with
4888 * multiple pool_workqueues which have separate congested states. A
4889 * workqueue being congested on one CPU doesn't mean the workqueue is also
4890 * congested on other CPUs / NUMA nodes.
4891 *
d185af30 4892 * Return:
dcd989cb 4893 * %true if congested, %false otherwise.
eef6a7d5 4894 */
d84ff051 4895bool workqueue_congested(int cpu, struct workqueue_struct *wq)
1da177e4 4896{
7fb98ea7 4897 struct pool_workqueue *pwq;
76af4d93
TH
4898 bool ret;
4899
24acfb71
TG
4900 rcu_read_lock();
4901 preempt_disable();
7fb98ea7 4902
d3251859
TH
4903 if (cpu == WORK_CPU_UNBOUND)
4904 cpu = smp_processor_id();
4905
7fb98ea7
TH
4906 if (!(wq->flags & WQ_UNBOUND))
4907 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4908 else
df2d5ae4 4909 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
dcd989cb 4910
f97a4a1a 4911 ret = !list_empty(&pwq->inactive_works);
24acfb71
TG
4912 preempt_enable();
4913 rcu_read_unlock();
76af4d93
TH
4914
4915 return ret;
1da177e4 4916}
dcd989cb 4917EXPORT_SYMBOL_GPL(workqueue_congested);
1da177e4 4918
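/*
 * Illustrative sketch (editor addition, not part of the original source):
 * because the result is only advisory, a hypothetical producer may use it
 * to throttle itself but must never rely on it for correctness.
 *
 *	if (workqueue_congested(WORK_CPU_UNBOUND, my_wq))
 *		my_driver_backoff();
 *	queue_work(my_wq, &my_work);
 */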
dcd989cb
TH
4919/**
4920 * work_busy - test whether a work is currently pending or running
4921 * @work: the work to be tested
4922 *
4923 * Test whether @work is currently pending or running. There is no
4924 * synchronization around this function and the test result is
4925 * unreliable and only useful as advisory hints or for debugging.
dcd989cb 4926 *
d185af30 4927 * Return:
dcd989cb
TH
4928 * OR'd bitmask of WORK_BUSY_* bits.
4929 */
4930unsigned int work_busy(struct work_struct *work)
1da177e4 4931{
fa1b54e6 4932 struct worker_pool *pool;
dcd989cb
TH
4933 unsigned long flags;
4934 unsigned int ret = 0;
1da177e4 4935
dcd989cb
TH
4936 if (work_pending(work))
4937 ret |= WORK_BUSY_PENDING;
1da177e4 4938
24acfb71 4939 rcu_read_lock();
fa1b54e6 4940 pool = get_work_pool(work);
038366c5 4941 if (pool) {
a9b8a985 4942 raw_spin_lock_irqsave(&pool->lock, flags);
038366c5
LJ
4943 if (find_worker_executing_work(pool, work))
4944 ret |= WORK_BUSY_RUNNING;
a9b8a985 4945 raw_spin_unlock_irqrestore(&pool->lock, flags);
038366c5 4946 }
24acfb71 4947 rcu_read_unlock();
1da177e4 4948
dcd989cb 4949 return ret;
1da177e4 4950}
dcd989cb 4951EXPORT_SYMBOL_GPL(work_busy);
1da177e4 4952
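/*
 * Illustrative sketch (editor addition, not part of the original source):
 * decoding the advisory bitmask for debugging output.  my_work is
 * hypothetical.
 *
 *	unsigned int busy = work_busy(&my_work);
 *
 *	pr_debug("my_work:%s%s\n",
 *		 busy & WORK_BUSY_PENDING ? " pending" : "",
 *		 busy & WORK_BUSY_RUNNING ? " running" : "");
 */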
3d1cb205
TH
4953/**
4954 * set_worker_desc - set description for the current work item
4955 * @fmt: printf-style format string
4956 * @...: arguments for the format string
4957 *
4958 * This function can be called by a running work function to describe what
4959 * the work item is about. If the worker task gets dumped, this
4960 * information will be printed out together with the dump to help debugging. The
4961 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4962 */
4963void set_worker_desc(const char *fmt, ...)
4964{
4965 struct worker *worker = current_wq_worker();
4966 va_list args;
4967
4968 if (worker) {
4969 va_start(args, fmt);
4970 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4971 va_end(args);
3d1cb205
TH
4972 }
4973}
5c750d58 4974EXPORT_SYMBOL_GPL(set_worker_desc);
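/*
 * Illustrative sketch (editor addition, not part of the original source): a
 * work function recording which object it is processing so the description
 * shows up in worker dumps and in wq_worker_comm() below.  struct my_obj,
 * its work and id members, and "my_driver" are hypothetical.
 *
 *	static void my_work_fn(struct work_struct *work)
 *	{
 *		struct my_obj *obj = container_of(work, struct my_obj, work);
 *
 *		set_worker_desc("my_driver/%d", obj->id);
 *		...
 *	}
 */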
3d1cb205
TH
4975
4976/**
4977 * print_worker_info - print out worker information and description
4978 * @log_lvl: the log level to use when printing
4979 * @task: target task
4980 *
4981 * If @task is a worker and currently executing a work item, print out the
4982 * name of the workqueue being serviced and worker description set with
4983 * set_worker_desc() by the currently executing work item.
4984 *
4985 * This function can be safely called on any task as long as the
4986 * task_struct itself is accessible. While safe, this function isn't
4987 * synchronized and may print out mixups or garbage of limited length.
4988 */
4989void print_worker_info(const char *log_lvl, struct task_struct *task)
4990{
4991 work_func_t *fn = NULL;
4992 char name[WQ_NAME_LEN] = { };
4993 char desc[WORKER_DESC_LEN] = { };
4994 struct pool_workqueue *pwq = NULL;
4995 struct workqueue_struct *wq = NULL;
3d1cb205
TH
4996 struct worker *worker;
4997
4998 if (!(task->flags & PF_WQ_WORKER))
4999 return;
5000
5001 /*
5002 * This function is called without any synchronization and @task
5003 * could be in any state. Be careful with dereferences.
5004 */
e700591a 5005 worker = kthread_probe_data(task);
3d1cb205
TH
5006
5007 /*
8bf89593
TH
5008 * Carefully copy the associated workqueue's workfn, name and desc.
5009 * Keep the original last '\0' in case the original is garbage.
3d1cb205 5010 */
fe557319
CH
5011 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
5012 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
5013 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
5014 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
5015 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
3d1cb205
TH
5016
5017 if (fn || name[0] || desc[0]) {
d75f773c 5018 printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
8bf89593 5019 if (strcmp(name, desc))
3d1cb205
TH
5020 pr_cont(" (%s)", desc);
5021 pr_cont("\n");
5022 }
5023}
5024
3494fc30
TH
5025static void pr_cont_pool_info(struct worker_pool *pool)
5026{
5027 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
5028 if (pool->node != NUMA_NO_NODE)
5029 pr_cont(" node=%d", pool->node);
5030 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
5031}
5032
c76feb0d
PM
5033struct pr_cont_work_struct {
5034 bool comma;
5035 work_func_t func;
5036 long ctr;
5037};
5038
5039static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
5040{
5041 if (!pcwsp->ctr)
5042 goto out_record;
5043 if (func == pcwsp->func) {
5044 pcwsp->ctr++;
5045 return;
5046 }
5047 if (pcwsp->ctr == 1)
5048 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
5049 else
5050 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
5051 pcwsp->ctr = 0;
5052out_record:
5053 if ((long)func == -1L)
5054 return;
5055 pcwsp->comma = comma;
5056 pcwsp->func = func;
5057 pcwsp->ctr = 1;
5058}
5059
5060static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
3494fc30
TH
5061{
5062 if (work->func == wq_barrier_func) {
5063 struct wq_barrier *barr;
5064
5065 barr = container_of(work, struct wq_barrier, work);
5066
c76feb0d 5067 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
3494fc30
TH
5068 pr_cont("%s BAR(%d)", comma ? "," : "",
5069 task_pid_nr(barr->task));
5070 } else {
c76feb0d
PM
5071 if (!comma)
5072 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
5073 pr_cont_work_flush(comma, work->func, pcwsp);
3494fc30
TH
5074 }
5075}
5076
5077static void show_pwq(struct pool_workqueue *pwq)
5078{
c76feb0d 5079 struct pr_cont_work_struct pcws = { .ctr = 0, };
3494fc30
TH
5080 struct worker_pool *pool = pwq->pool;
5081 struct work_struct *work;
5082 struct worker *worker;
5083 bool has_in_flight = false, has_pending = false;
5084 int bkt;
5085
5086 pr_info(" pwq %d:", pool->id);
5087 pr_cont_pool_info(pool);
5088
e66b39af
TH
5089 pr_cont(" active=%d/%d refcnt=%d%s\n",
5090 pwq->nr_active, pwq->max_active, pwq->refcnt,
3494fc30
TH
5091 !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
5092
5093 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
5094 if (worker->current_pwq == pwq) {
5095 has_in_flight = true;
5096 break;
5097 }
5098 }
5099 if (has_in_flight) {
5100 bool comma = false;
5101
5102 pr_info(" in-flight:");
5103 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
5104 if (worker->current_pwq != pwq)
5105 continue;
5106
d75f773c 5107 pr_cont("%s %d%s:%ps", comma ? "," : "",
3494fc30 5108 task_pid_nr(worker->task),
30ae2fc0 5109 worker->rescue_wq ? "(RESCUER)" : "",
3494fc30
TH
5110 worker->current_func);
5111 list_for_each_entry(work, &worker->scheduled, entry)
c76feb0d
PM
5112 pr_cont_work(false, work, &pcws);
5113 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5114 comma = true;
5115 }
5116 pr_cont("\n");
5117 }
5118
5119 list_for_each_entry(work, &pool->worklist, entry) {
5120 if (get_work_pwq(work) == pwq) {
5121 has_pending = true;
5122 break;
5123 }
5124 }
5125 if (has_pending) {
5126 bool comma = false;
5127
5128 pr_info(" pending:");
5129 list_for_each_entry(work, &pool->worklist, entry) {
5130 if (get_work_pwq(work) != pwq)
5131 continue;
5132
c76feb0d 5133 pr_cont_work(comma, work, &pcws);
3494fc30
TH
5134 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
5135 }
c76feb0d 5136 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5137 pr_cont("\n");
5138 }
5139
f97a4a1a 5140 if (!list_empty(&pwq->inactive_works)) {
3494fc30
TH
5141 bool comma = false;
5142
f97a4a1a
LJ
5143 pr_info(" inactive:");
5144 list_for_each_entry(work, &pwq->inactive_works, entry) {
c76feb0d 5145 pr_cont_work(comma, work, &pcws);
3494fc30
TH
5146 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
5147 }
c76feb0d 5148 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5149 pr_cont("\n");
5150 }
5151}
5152
5153/**
55df0933
IK
5154 * show_one_workqueue - dump state of specified workqueue
5155 * @wq: workqueue whose state will be printed
3494fc30 5156 */
55df0933 5157void show_one_workqueue(struct workqueue_struct *wq)
3494fc30 5158{
55df0933
IK
5159 struct pool_workqueue *pwq;
5160 bool idle = true;
3494fc30 5161 unsigned long flags;
3494fc30 5162
55df0933
IK
5163 for_each_pwq(pwq, wq) {
5164 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
5165 idle = false;
5166 break;
3494fc30 5167 }
55df0933
IK
5168 }
5169 if (idle) /* Nothing to print for idle workqueue */
5170 return;
3494fc30 5171
55df0933 5172 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
3494fc30 5173
55df0933
IK
5174 for_each_pwq(pwq, wq) {
5175 raw_spin_lock_irqsave(&pwq->pool->lock, flags);
5176 if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
62635ea8 5177 /*
55df0933
IK
5178 * Defer printing to avoid deadlocks in console
5179 * drivers that queue work while holding locks
5180 * also taken in their write paths.
62635ea8 5181 */
55df0933
IK
5182 printk_deferred_enter();
5183 show_pwq(pwq);
5184 printk_deferred_exit();
3494fc30 5185 }
55df0933 5186 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
62635ea8
SS
5187 /*
5188 * We could be printing a lot from atomic context, e.g.
55df0933 5189 * sysrq-t -> show_all_workqueues(). Avoid triggering
62635ea8
SS
5190 * hard lockup.
5191 */
5192 touch_nmi_watchdog();
3494fc30
TH
5193 }
5194
55df0933
IK
5195}
5196
5197/**
5198 * show_one_worker_pool - dump state of specified worker pool
5199 * @pool: worker pool whose state will be printed
5200 */
5201static void show_one_worker_pool(struct worker_pool *pool)
5202{
5203 struct worker *worker;
5204 bool first = true;
5205 unsigned long flags;
335a42eb 5206 unsigned long hung = 0;
55df0933
IK
5207
5208 raw_spin_lock_irqsave(&pool->lock, flags);
5209 if (pool->nr_workers == pool->nr_idle)
5210 goto next_pool;
335a42eb
PM
5211
5212 /* How long the first pending work is waiting for a worker. */
5213 if (!list_empty(&pool->worklist))
5214 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
5215
55df0933
IK
5216 /*
5217 * Defer printing to avoid deadlocks in console drivers that
5218 * queue work while holding locks also taken in their write
5219 * paths.
5220 */
5221 printk_deferred_enter();
5222 pr_info("pool %d:", pool->id);
5223 pr_cont_pool_info(pool);
335a42eb 5224 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
55df0933
IK
5225 if (pool->manager)
5226 pr_cont(" manager: %d",
5227 task_pid_nr(pool->manager->task));
5228 list_for_each_entry(worker, &pool->idle_list, entry) {
5229 pr_cont(" %s%d", first ? "idle: " : "",
5230 task_pid_nr(worker->task));
5231 first = false;
5232 }
5233 pr_cont("\n");
5234 printk_deferred_exit();
5235next_pool:
5236 raw_spin_unlock_irqrestore(&pool->lock, flags);
5237 /*
5238 * We could be printing a lot from atomic context, e.g.
5239 * sysrq-t -> show_all_workqueues(). Avoid triggering
5240 * hard lockup.
5241 */
5242 touch_nmi_watchdog();
5243
5244}
5245
5246/**
5247 * show_all_workqueues - dump workqueue state
5248 *
704bc669 5249 * Called from a sysrq handler and prints out all busy workqueues and pools.
55df0933
IK
5250 */
5251void show_all_workqueues(void)
5252{
5253 struct workqueue_struct *wq;
5254 struct worker_pool *pool;
5255 int pi;
5256
5257 rcu_read_lock();
5258
5259 pr_info("Showing busy workqueues and worker pools:\n");
5260
5261 list_for_each_entry_rcu(wq, &workqueues, list)
5262 show_one_workqueue(wq);
5263
5264 for_each_pool(pool, pi)
5265 show_one_worker_pool(pool);
5266
24acfb71 5267 rcu_read_unlock();
3494fc30
TH
5268}
5269
704bc669
JL
5270/**
5271 * show_freezable_workqueues - dump freezable workqueue state
5272 *
5273 * Called from try_to_freeze_tasks() and prints out all freezable workqueues
5274 * still busy.
5275 */
5276void show_freezable_workqueues(void)
5277{
5278 struct workqueue_struct *wq;
5279
5280 rcu_read_lock();
5281
5282 pr_info("Showing freezable workqueues that are still busy:\n");
5283
5284 list_for_each_entry_rcu(wq, &workqueues, list) {
5285 if (!(wq->flags & WQ_FREEZABLE))
5286 continue;
5287 show_one_workqueue(wq);
5288 }
5289
5290 rcu_read_unlock();
5291}
5292
6b59808b
TH
5293/* used to show worker information through /proc/PID/{comm,stat,status} */
5294void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
5295{
6b59808b
TH
5296 int off;
5297
5298 /* always show the actual comm */
5299 off = strscpy(buf, task->comm, size);
5300 if (off < 0)
5301 return;
5302
197f6acc 5303 /* stabilize PF_WQ_WORKER and worker pool association */
6b59808b
TH
5304 mutex_lock(&wq_pool_attach_mutex);
5305
197f6acc
TH
5306 if (task->flags & PF_WQ_WORKER) {
5307 struct worker *worker = kthread_data(task);
5308 struct worker_pool *pool = worker->pool;
6b59808b 5309
197f6acc 5310 if (pool) {
a9b8a985 5311 raw_spin_lock_irq(&pool->lock);
197f6acc
TH
5312 /*
5313 * ->desc tracks information (wq name or
5314 * set_worker_desc()) for the latest execution. If
5315 * current, prepend '+', otherwise '-'.
5316 */
5317 if (worker->desc[0] != '\0') {
5318 if (worker->current_work)
5319 scnprintf(buf + off, size - off, "+%s",
5320 worker->desc);
5321 else
5322 scnprintf(buf + off, size - off, "-%s",
5323 worker->desc);
5324 }
a9b8a985 5325 raw_spin_unlock_irq(&pool->lock);
6b59808b 5326 }
6b59808b
TH
5327 }
5328
5329 mutex_unlock(&wq_pool_attach_mutex);
5330}
5331
66448bc2
MM
5332#ifdef CONFIG_SMP
5333
db7bccf4
TH
5334/*
5335 * CPU hotplug.
5336 *
e22bee78 5337 * There are two challenges in supporting CPU hotplug. Firstly, there
112202d9 5338 * are a lot of assumptions on strong associations among work, pwq and
706026c2 5339 * pool which make migrating pending and scheduled works very
e22bee78 5340 * difficult to implement without impacting hot paths. Secondly,
94cf58bb 5341 * worker pools serve a mix of short, long and very long running works, making
e22bee78
TH
5342 * blocked draining impractical.
5343 *
24647570 5344 * This is solved by allowing the pools to be disassociated from the CPU
628c78e7
TH
5345 * running as an unbound one and allowing it to be reattached later if the
5346 * cpu comes back online.
db7bccf4 5347 */
1da177e4 5348
e8b3f8db 5349static void unbind_workers(int cpu)
3af24433 5350{
4ce62e9e 5351 struct worker_pool *pool;
db7bccf4 5352 struct worker *worker;
3af24433 5353
f02ae73a 5354 for_each_cpu_worker_pool(pool, cpu) {
1258fae7 5355 mutex_lock(&wq_pool_attach_mutex);
a9b8a985 5356 raw_spin_lock_irq(&pool->lock);
3af24433 5357
94cf58bb 5358 /*
92f9c5c4 5359 * We've blocked all attach/detach operations. Make all workers
94cf58bb 5360 * unbound and set DISASSOCIATED. Before this, all workers
11b45b0b 5361 * must be on the cpu. After this, they may become diasporas.
b4ac9384
LJ
5362 * And the preemption disabled section in their sched callbacks
5363 * is guaranteed to see WORKER_UNBOUND since the code here
5364 * is on the same cpu.
94cf58bb 5365 */
da028469 5366 for_each_pool_worker(worker, pool)
c9e7cf27 5367 worker->flags |= WORKER_UNBOUND;
06ba38a9 5368
24647570 5369 pool->flags |= POOL_DISASSOCIATED;
f2d5a0ee 5370
eb283428 5371 /*
989442d7
LJ
5372 * The handling of nr_running in sched callbacks is disabled
5373 * now. Zap nr_running. After this, nr_running stays zero and
5374 * need_more_worker() and keep_working() are always true as
5375 * long as the worklist is not empty. This pool now behaves as
5376 * an unbound (in terms of concurrency management) pool which
eb283428
LJ
5377 * is served by workers tied to the pool.
5378 */
bc35f7ef 5379 pool->nr_running = 0;
eb283428
LJ
5380
5381 /*
5382 * With concurrency management just turned off, a busy
5383 * worker blocking could lead to lengthy stalls. Kick off
5384 * unbound chain execution of currently pending work items.
5385 */
eb283428 5386 wake_up_worker(pool);
989442d7 5387
a9b8a985 5388 raw_spin_unlock_irq(&pool->lock);
989442d7 5389
793777bc
VS
5390 for_each_pool_worker(worker, pool)
5391 unbind_worker(worker);
989442d7
LJ
5392
5393 mutex_unlock(&wq_pool_attach_mutex);
eb283428 5394 }
3af24433 5395}
3af24433 5396
bd7c089e
TH
5397/**
5398 * rebind_workers - rebind all workers of a pool to the associated CPU
5399 * @pool: pool of interest
5400 *
a9ab775b 5401 * @pool->cpu is coming online. Rebind all workers to the CPU.
bd7c089e
TH
5402 */
5403static void rebind_workers(struct worker_pool *pool)
5404{
a9ab775b 5405 struct worker *worker;
bd7c089e 5406
1258fae7 5407 lockdep_assert_held(&wq_pool_attach_mutex);
bd7c089e 5408
a9ab775b
TH
5409 /*
5410 * Restore CPU affinity of all workers. As all idle workers should
5411 * be on the run-queue of the associated CPU before any local
402dd89d 5412 * wake-ups for concurrency management happen, restore CPU affinity
a9ab775b
TH
5413 * of all workers first and then clear UNBOUND. As we're called
5414 * from CPU_ONLINE, the following shouldn't fail.
5415 */
c63a2e52
VS
5416 for_each_pool_worker(worker, pool) {
5417 kthread_set_per_cpu(worker->task, pool->cpu);
5418 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
5419 pool->attrs->cpumask) < 0);
5420 }
bd7c089e 5421
a9b8a985 5422 raw_spin_lock_irq(&pool->lock);
f7c17d26 5423
3de5e884 5424 pool->flags &= ~POOL_DISASSOCIATED;
bd7c089e 5425
da028469 5426 for_each_pool_worker(worker, pool) {
a9ab775b 5427 unsigned int worker_flags = worker->flags;
bd7c089e 5428
a9ab775b
TH
5429 /*
5430 * We want to clear UNBOUND but can't directly call
5431 * worker_clr_flags() or adjust nr_running. Atomically
5432 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
5433 * @worker will clear REBOUND using worker_clr_flags() when
5434 * it initiates the next execution cycle thus restoring
5435 * concurrency management. Note that when or whether
5436 * @worker clears REBOUND doesn't affect correctness.
5437 *
c95491ed 5438 * WRITE_ONCE() is necessary because @worker->flags may be
a9ab775b 5439 * tested without holding any lock in
6d25be57 5440 * wq_worker_running(). Without it, NOT_RUNNING test may
a9ab775b
TH
5441 * fail incorrectly leading to premature concurrency
5442 * management operations.
5443 */
5444 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
5445 worker_flags |= WORKER_REBOUND;
5446 worker_flags &= ~WORKER_UNBOUND;
c95491ed 5447 WRITE_ONCE(worker->flags, worker_flags);
bd7c089e 5448 }
a9ab775b 5449
a9b8a985 5450 raw_spin_unlock_irq(&pool->lock);
bd7c089e
TH
5451}
5452
7dbc725e
TH
5453/**
5454 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
5455 * @pool: unbound pool of interest
5456 * @cpu: the CPU which is coming up
5457 *
5458 * An unbound pool may end up with a cpumask which doesn't have any online
5459 * CPUs. When a worker of such a pool gets scheduled, the scheduler resets
5460 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
5461 * online CPU before, cpus_allowed of all its workers should be restored.
5462 */
5463static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
5464{
5465 static cpumask_t cpumask;
5466 struct worker *worker;
7dbc725e 5467
1258fae7 5468 lockdep_assert_held(&wq_pool_attach_mutex);
7dbc725e
TH
5469
5470 /* is @cpu allowed for @pool? */
5471 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
5472 return;
5473
7dbc725e 5474 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
7dbc725e
TH
5475
5476 /* as we're called from CPU_ONLINE, the following shouldn't fail */
da028469 5477 for_each_pool_worker(worker, pool)
d945b5e9 5478 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
7dbc725e
TH
5479}
5480
7ee681b2
TG
5481int workqueue_prepare_cpu(unsigned int cpu)
5482{
5483 struct worker_pool *pool;
5484
5485 for_each_cpu_worker_pool(pool, cpu) {
5486 if (pool->nr_workers)
5487 continue;
5488 if (!create_worker(pool))
5489 return -ENOMEM;
5490 }
5491 return 0;
5492}
5493
5494int workqueue_online_cpu(unsigned int cpu)
3af24433 5495{
4ce62e9e 5496 struct worker_pool *pool;
4c16bd32 5497 struct workqueue_struct *wq;
7dbc725e 5498 int pi;
3ce63377 5499
7ee681b2 5500 mutex_lock(&wq_pool_mutex);
7dbc725e 5501
7ee681b2 5502 for_each_pool(pool, pi) {
1258fae7 5503 mutex_lock(&wq_pool_attach_mutex);
94cf58bb 5504
7ee681b2
TG
5505 if (pool->cpu == cpu)
5506 rebind_workers(pool);
5507 else if (pool->cpu < 0)
5508 restore_unbound_workers_cpumask(pool, cpu);
94cf58bb 5509
1258fae7 5510 mutex_unlock(&wq_pool_attach_mutex);
7ee681b2 5511 }
6ba94429 5512
7ee681b2
TG
5513 /* update NUMA affinity of unbound workqueues */
5514 list_for_each_entry(wq, &workqueues, list)
5515 wq_update_unbound_numa(wq, cpu, true);
6ba94429 5516
7ee681b2
TG
5517 mutex_unlock(&wq_pool_mutex);
5518 return 0;
6ba94429
FW
5519}
5520
7ee681b2 5521int workqueue_offline_cpu(unsigned int cpu)
6ba94429 5522{
6ba94429
FW
5523 struct workqueue_struct *wq;
5524
7ee681b2 5525 /* unbinding per-cpu workers should happen on the local CPU */
e8b3f8db
LJ
5526 if (WARN_ON(cpu != smp_processor_id()))
5527 return -1;
5528
5529 unbind_workers(cpu);
7ee681b2
TG
5530
5531 /* update NUMA affinity of unbound workqueues */
5532 mutex_lock(&wq_pool_mutex);
5533 list_for_each_entry(wq, &workqueues, list)
5534 wq_update_unbound_numa(wq, cpu, false);
5535 mutex_unlock(&wq_pool_mutex);
5536
7ee681b2 5537 return 0;
6ba94429
FW
5538}
5539
6ba94429
FW
5540struct work_for_cpu {
5541 struct work_struct work;
5542 long (*fn)(void *);
5543 void *arg;
5544 long ret;
5545};
5546
5547static void work_for_cpu_fn(struct work_struct *work)
5548{
5549 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
5550
5551 wfc->ret = wfc->fn(wfc->arg);
5552}
5553
5554/**
22aceb31 5555 * work_on_cpu - run a function in thread context on a particular cpu
6ba94429
FW
5556 * @cpu: the cpu to run on
5557 * @fn: the function to run
5558 * @arg: the function arg
5559 *
5560 * It is up to the caller to ensure that the cpu doesn't go offline.
5561 * The caller must not hold any locks which would prevent @fn from completing.
5562 *
5563 * Return: The value @fn returns.
5564 */
5565long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
5566{
5567 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
5568
5569 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
5570 schedule_work_on(cpu, &wfc.work);
5571 flush_work(&wfc.work);
5572 destroy_work_on_stack(&wfc.work);
5573 return wfc.ret;
5574}
5575EXPORT_SYMBOL_GPL(work_on_cpu);
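/*
 * Illustrative sketch (editor addition, not part of the original source):
 * running a hypothetical CPU-local query synchronously on a specific CPU.
 * The caller must keep @cpu online, e.g. via cpus_read_lock(), or use
 * work_on_cpu_safe() below which disables CPU hotplug itself.
 *
 *	static long my_query_fn(void *arg)
 *	{
 *		return my_read_local_state((unsigned long)arg);
 *	}
 *
 *	ret = work_on_cpu(cpu, my_query_fn, (void *)MY_QUERY_ID);
 */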
0e8d6a93
TG
5576
5577/**
5578 * work_on_cpu_safe - run a function in thread context on a particular cpu
5579 * @cpu: the cpu to run on
5580 * @fn: the function to run
5581 * @arg: the function argument
5582 *
5583 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
5584 * any locks which would prevent @fn from completing.
5585 *
5586 * Return: The value @fn returns.
5587 */
5588long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
5589{
5590 long ret = -ENODEV;
5591
ffd8bea8 5592 cpus_read_lock();
0e8d6a93
TG
5593 if (cpu_online(cpu))
5594 ret = work_on_cpu(cpu, fn, arg);
ffd8bea8 5595 cpus_read_unlock();
0e8d6a93
TG
5596 return ret;
5597}
5598EXPORT_SYMBOL_GPL(work_on_cpu_safe);
6ba94429
FW
5599#endif /* CONFIG_SMP */
5600
5601#ifdef CONFIG_FREEZER
5602
5603/**
5604 * freeze_workqueues_begin - begin freezing workqueues
5605 *
5606 * Start freezing workqueues. After this function returns, all freezable
f97a4a1a 5607 * workqueues will queue new works to their inactive_works list instead of
6ba94429
FW
5608 * pool->worklist.
5609 *
5610 * CONTEXT:
5611 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
5612 */
5613void freeze_workqueues_begin(void)
5614{
5615 struct workqueue_struct *wq;
5616 struct pool_workqueue *pwq;
5617
5618 mutex_lock(&wq_pool_mutex);
5619
5620 WARN_ON_ONCE(workqueue_freezing);
5621 workqueue_freezing = true;
5622
5623 list_for_each_entry(wq, &workqueues, list) {
5624 mutex_lock(&wq->mutex);
5625 for_each_pwq(pwq, wq)
5626 pwq_adjust_max_active(pwq);
5627 mutex_unlock(&wq->mutex);
5628 }
5629
5630 mutex_unlock(&wq_pool_mutex);
5631}
5632
5633/**
5634 * freeze_workqueues_busy - are freezable workqueues still busy?
5635 *
5636 * Check whether freezing is complete. This function must be called
5637 * between freeze_workqueues_begin() and thaw_workqueues().
5638 *
5639 * CONTEXT:
5640 * Grabs and releases wq_pool_mutex.
5641 *
5642 * Return:
5643 * %true if some freezable workqueues are still busy. %false if freezing
5644 * is complete.
5645 */
5646bool freeze_workqueues_busy(void)
5647{
5648 bool busy = false;
5649 struct workqueue_struct *wq;
5650 struct pool_workqueue *pwq;
5651
5652 mutex_lock(&wq_pool_mutex);
5653
5654 WARN_ON_ONCE(!workqueue_freezing);
5655
5656 list_for_each_entry(wq, &workqueues, list) {
5657 if (!(wq->flags & WQ_FREEZABLE))
5658 continue;
5659 /*
5660 * nr_active is monotonically decreasing. It's safe
5661 * to peek without lock.
5662 */
24acfb71 5663 rcu_read_lock();
6ba94429
FW
5664 for_each_pwq(pwq, wq) {
5665 WARN_ON_ONCE(pwq->nr_active < 0);
5666 if (pwq->nr_active) {
5667 busy = true;
24acfb71 5668 rcu_read_unlock();
6ba94429
FW
5669 goto out_unlock;
5670 }
5671 }
24acfb71 5672 rcu_read_unlock();
6ba94429
FW
5673 }
5674out_unlock:
5675 mutex_unlock(&wq_pool_mutex);
5676 return busy;
5677}
5678
5679/**
5680 * thaw_workqueues - thaw workqueues
5681 *
5682 * Thaw workqueues. Normal queueing is restored and all collected
5683 * frozen works are transferred to their respective pool worklists.
5684 *
5685 * CONTEXT:
5686 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
5687 */
5688void thaw_workqueues(void)
5689{
5690 struct workqueue_struct *wq;
5691 struct pool_workqueue *pwq;
5692
5693 mutex_lock(&wq_pool_mutex);
5694
5695 if (!workqueue_freezing)
5696 goto out_unlock;
5697
5698 workqueue_freezing = false;
5699
5700 /* restore max_active and repopulate worklist */
5701 list_for_each_entry(wq, &workqueues, list) {
5702 mutex_lock(&wq->mutex);
5703 for_each_pwq(pwq, wq)
5704 pwq_adjust_max_active(pwq);
5705 mutex_unlock(&wq->mutex);
5706 }
5707
5708out_unlock:
5709 mutex_unlock(&wq_pool_mutex);
5710}
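/*
 * Illustrative sketch (editor addition, not part of the original source):
 * the suspend path drives the three functions above roughly like this; the
 * real loop lives in kernel/power/process.c and also freezes tasks and
 * enforces a timeout.
 *
 *	freeze_workqueues_begin();
 *	while (freeze_workqueues_busy())
 *		msleep(10);
 *	... system is suspended, then on resume ...
 *	thaw_workqueues();
 */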
5711#endif /* CONFIG_FREEZER */
5712
99c621ef 5713static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
042f7df1
LJ
5714{
5715 LIST_HEAD(ctxs);
5716 int ret = 0;
5717 struct workqueue_struct *wq;
5718 struct apply_wqattrs_ctx *ctx, *n;
5719
5720 lockdep_assert_held(&wq_pool_mutex);
5721
5722 list_for_each_entry(wq, &workqueues, list) {
5723 if (!(wq->flags & WQ_UNBOUND))
5724 continue;
5725 /* creating multiple pwqs breaks ordering guarantee */
5726 if (wq->flags & __WQ_ORDERED)
5727 continue;
5728
99c621ef 5729 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
042f7df1
LJ
5730 if (!ctx) {
5731 ret = -ENOMEM;
5732 break;
5733 }
5734
5735 list_add_tail(&ctx->list, &ctxs);
5736 }
5737
5738 list_for_each_entry_safe(ctx, n, &ctxs, list) {
5739 if (!ret)
5740 apply_wqattrs_commit(ctx);
5741 apply_wqattrs_cleanup(ctx);
5742 }
5743
99c621ef
LJ
5744 if (!ret) {
5745 mutex_lock(&wq_pool_attach_mutex);
5746 cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
5747 mutex_unlock(&wq_pool_attach_mutex);
5748 }
042f7df1
LJ
5749 return ret;
5750}
5751
5752/**
5753 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
5754 * @cpumask: the cpumask to set
5755 *
5756 * The low-level workqueues cpumask is a global cpumask that limits
5757 * the affinity of all unbound workqueues. This function checks the @cpumask
5758 * and applies it to all unbound workqueues, updating all of their pwqs.
5759 *
67dc8325 5760 * Return: 0 - Success
042f7df1
LJ
5761 * -EINVAL - Invalid @cpumask
5762 * -ENOMEM - Failed to allocate memory for attrs or pwqs.
5763 */
5764int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
5765{
5766 int ret = -EINVAL;
042f7df1 5767
c98a9805
TS
5768 /*
5769 * Not excluding isolated cpus on purpose.
5770 * If the user wishes to include them, we allow that.
5771 */
042f7df1
LJ
5772 cpumask_and(cpumask, cpumask, cpu_possible_mask);
5773 if (!cpumask_empty(cpumask)) {
a0111cf6 5774 apply_wqattrs_lock();
d25302e4
MD
5775 if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
5776 ret = 0;
5777 goto out_unlock;
5778 }
5779
99c621ef 5780 ret = workqueue_apply_unbound_cpumask(cpumask);
042f7df1 5781
d25302e4 5782out_unlock:
a0111cf6 5783 apply_wqattrs_unlock();
042f7df1 5784 }
042f7df1 5785
042f7df1
LJ
5786 return ret;
5787}
5788
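/*
 * Illustrative sketch (editor addition, not part of the original source):
 * restricting all unbound workqueues to CPUs 0-3 from built-in kernel code.
 * Userspace normally does the equivalent through the sysfs "cpumask"
 * attribute registered below.
 *
 *	cpumask_var_t mask;
 *	int cpu, ret;
 *
 *	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
 *		return -ENOMEM;
 *	for (cpu = 0; cpu <= 3; cpu++)
 *		cpumask_set_cpu(cpu, mask);
 *	ret = workqueue_set_unbound_cpumask(mask);
 *	free_cpumask_var(mask);
 */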
6ba94429
FW
5789#ifdef CONFIG_SYSFS
5790/*
5791 * Workqueues with the WQ_SYSFS flag set are visible to userland via
5792 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
5793 * following attributes.
5794 *
5795 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
5796 * max_active RW int : maximum number of in-flight work items
5797 *
5798 * Unbound workqueues have the following extra attributes.
5799 *
9a19b463 5800 * pool_ids RO int : the associated pool IDs for each node
6ba94429
FW
5801 * nice RW int : nice value of the workers
5802 * cpumask RW mask : bitmask of allowed CPUs for the workers
9a19b463 5803 * numa RW bool : whether to enable NUMA affinity
6ba94429
FW
5804 */
5805struct wq_device {
5806 struct workqueue_struct *wq;
5807 struct device dev;
5808};
5809
5810static struct workqueue_struct *dev_to_wq(struct device *dev)
5811{
5812 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
5813
5814 return wq_dev->wq;
5815}
5816
5817static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
5818 char *buf)
5819{
5820 struct workqueue_struct *wq = dev_to_wq(dev);
5821
5822 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
5823}
5824static DEVICE_ATTR_RO(per_cpu);
5825
5826static ssize_t max_active_show(struct device *dev,
5827 struct device_attribute *attr, char *buf)
5828{
5829 struct workqueue_struct *wq = dev_to_wq(dev);
5830
5831 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
5832}
5833
5834static ssize_t max_active_store(struct device *dev,
5835 struct device_attribute *attr, const char *buf,
5836 size_t count)
5837{
5838 struct workqueue_struct *wq = dev_to_wq(dev);
5839 int val;
5840
5841 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
5842 return -EINVAL;
5843
5844 workqueue_set_max_active(wq, val);
5845 return count;
5846}
5847static DEVICE_ATTR_RW(max_active);
5848
5849static struct attribute *wq_sysfs_attrs[] = {
5850 &dev_attr_per_cpu.attr,
5851 &dev_attr_max_active.attr,
5852 NULL,
5853};
5854ATTRIBUTE_GROUPS(wq_sysfs);
5855
5856static ssize_t wq_pool_ids_show(struct device *dev,
5857 struct device_attribute *attr, char *buf)
5858{
5859 struct workqueue_struct *wq = dev_to_wq(dev);
5860 const char *delim = "";
5861 int node, written = 0;
5862
ffd8bea8 5863 cpus_read_lock();
24acfb71 5864 rcu_read_lock();
6ba94429
FW
5865 for_each_node(node) {
5866 written += scnprintf(buf + written, PAGE_SIZE - written,
5867 "%s%d:%d", delim, node,
5868 unbound_pwq_by_node(wq, node)->pool->id);
5869 delim = " ";
5870 }
5871 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
24acfb71 5872 rcu_read_unlock();
ffd8bea8 5873 cpus_read_unlock();
6ba94429
FW
5874
5875 return written;
5876}
5877
5878static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
5879 char *buf)
5880{
5881 struct workqueue_struct *wq = dev_to_wq(dev);
5882 int written;
5883
5884 mutex_lock(&wq->mutex);
5885 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
5886 mutex_unlock(&wq->mutex);
5887
5888 return written;
5889}
5890
5891/* prepare workqueue_attrs for sysfs store operations */
5892static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
5893{
5894 struct workqueue_attrs *attrs;
5895
899a94fe
LJ
5896 lockdep_assert_held(&wq_pool_mutex);
5897
be69d00d 5898 attrs = alloc_workqueue_attrs();
6ba94429
FW
5899 if (!attrs)
5900 return NULL;
5901
6ba94429 5902 copy_workqueue_attrs(attrs, wq->unbound_attrs);
6ba94429
FW
5903 return attrs;
5904}
5905
5906static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
5907 const char *buf, size_t count)
5908{
5909 struct workqueue_struct *wq = dev_to_wq(dev);
5910 struct workqueue_attrs *attrs;
d4d3e257
LJ
5911 int ret = -ENOMEM;
5912
5913 apply_wqattrs_lock();
6ba94429
FW
5914
5915 attrs = wq_sysfs_prep_attrs(wq);
5916 if (!attrs)
d4d3e257 5917 goto out_unlock;
6ba94429
FW
5918
5919 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
5920 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
d4d3e257 5921 ret = apply_workqueue_attrs_locked(wq, attrs);
6ba94429
FW
5922 else
5923 ret = -EINVAL;
5924
d4d3e257
LJ
5925out_unlock:
5926 apply_wqattrs_unlock();
6ba94429
FW
5927 free_workqueue_attrs(attrs);
5928 return ret ?: count;
5929}
5930
5931static ssize_t wq_cpumask_show(struct device *dev,
5932 struct device_attribute *attr, char *buf)
5933{
5934 struct workqueue_struct *wq = dev_to_wq(dev);
5935 int written;
5936
5937 mutex_lock(&wq->mutex);
5938 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
5939 cpumask_pr_args(wq->unbound_attrs->cpumask));
5940 mutex_unlock(&wq->mutex);
5941 return written;
5942}
5943
5944static ssize_t wq_cpumask_store(struct device *dev,
5945 struct device_attribute *attr,
5946 const char *buf, size_t count)
5947{
5948 struct workqueue_struct *wq = dev_to_wq(dev);
5949 struct workqueue_attrs *attrs;
d4d3e257
LJ
5950 int ret = -ENOMEM;
5951
5952 apply_wqattrs_lock();
6ba94429
FW
5953
5954 attrs = wq_sysfs_prep_attrs(wq);
5955 if (!attrs)
d4d3e257 5956 goto out_unlock;
6ba94429
FW
5957
5958 ret = cpumask_parse(buf, attrs->cpumask);
5959 if (!ret)
d4d3e257 5960 ret = apply_workqueue_attrs_locked(wq, attrs);
6ba94429 5961
d4d3e257
LJ
5962out_unlock:
5963 apply_wqattrs_unlock();
6ba94429
FW
5964 free_workqueue_attrs(attrs);
5965 return ret ?: count;
5966}
5967
5968static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
5969 char *buf)
5970{
5971 struct workqueue_struct *wq = dev_to_wq(dev);
5972 int written;
7dbc725e 5973
6ba94429
FW
5974 mutex_lock(&wq->mutex);
5975 written = scnprintf(buf, PAGE_SIZE, "%d\n",
5976 !wq->unbound_attrs->no_numa);
5977 mutex_unlock(&wq->mutex);
4c16bd32 5978
6ba94429 5979 return written;
65758202
TH
5980}
5981
6ba94429
FW
5982static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
5983 const char *buf, size_t count)
65758202 5984{
6ba94429
FW
5985 struct workqueue_struct *wq = dev_to_wq(dev);
5986 struct workqueue_attrs *attrs;
d4d3e257
LJ
5987 int v, ret = -ENOMEM;
5988
5989 apply_wqattrs_lock();
4c16bd32 5990
6ba94429
FW
5991 attrs = wq_sysfs_prep_attrs(wq);
5992 if (!attrs)
d4d3e257 5993 goto out_unlock;
4c16bd32 5994
6ba94429
FW
5995 ret = -EINVAL;
5996 if (sscanf(buf, "%d", &v) == 1) {
5997 attrs->no_numa = !v;
d4d3e257 5998 ret = apply_workqueue_attrs_locked(wq, attrs);
65758202 5999 }
6ba94429 6000
d4d3e257
LJ
6001out_unlock:
6002 apply_wqattrs_unlock();
6ba94429
FW
6003 free_workqueue_attrs(attrs);
6004 return ret ?: count;
65758202
TH
6005}
6006
6ba94429
FW
6007static struct device_attribute wq_sysfs_unbound_attrs[] = {
6008 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
6009 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
6010 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
6011 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
6012 __ATTR_NULL,
6013};
8ccad40d 6014
6ba94429
FW
6015static struct bus_type wq_subsys = {
6016 .name = "workqueue",
6017 .dev_groups = wq_sysfs_groups,
2d3854a3
RR
6018};
6019
b05a7928
FW
6020static ssize_t wq_unbound_cpumask_show(struct device *dev,
6021 struct device_attribute *attr, char *buf)
6022{
6023 int written;
6024
042f7df1 6025 mutex_lock(&wq_pool_mutex);
b05a7928
FW
6026 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
6027 cpumask_pr_args(wq_unbound_cpumask));
042f7df1 6028 mutex_unlock(&wq_pool_mutex);
b05a7928
FW
6029
6030 return written;
6031}
6032
042f7df1
LJ
6033static ssize_t wq_unbound_cpumask_store(struct device *dev,
6034 struct device_attribute *attr, const char *buf, size_t count)
6035{
6036 cpumask_var_t cpumask;
6037 int ret;
6038
6039 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
6040 return -ENOMEM;
6041
6042 ret = cpumask_parse(buf, cpumask);
6043 if (!ret)
6044 ret = workqueue_set_unbound_cpumask(cpumask);
6045
6046 free_cpumask_var(cpumask);
6047 return ret ? ret : count;
6048}
6049
b05a7928 6050static struct device_attribute wq_sysfs_cpumask_attr =
042f7df1
LJ
6051 __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
6052 wq_unbound_cpumask_store);
b05a7928 6053
6ba94429 6054static int __init wq_sysfs_init(void)
2d3854a3 6055{
686f6697 6056 struct device *dev_root;
b05a7928
FW
6057 int err;
6058
6059 err = subsys_virtual_register(&wq_subsys, NULL);
6060 if (err)
6061 return err;
6062
686f6697
GKH
6063 dev_root = bus_get_dev_root(&wq_subsys);
6064 if (dev_root) {
6065 err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
6066 put_device(dev_root);
6067 }
6068 return err;
2d3854a3 6069}
6ba94429 6070core_initcall(wq_sysfs_init);
2d3854a3 6071
6ba94429 6072static void wq_device_release(struct device *dev)
2d3854a3 6073{
6ba94429 6074 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
6b44003e 6075
6ba94429 6076 kfree(wq_dev);
2d3854a3 6077}
a0a1a5fd
TH
6078
6079/**
6ba94429
FW
6080 * workqueue_sysfs_register - make a workqueue visible in sysfs
6081 * @wq: the workqueue to register
a0a1a5fd 6082 *
6ba94429
FW
6083 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
6084 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
6085 * which is the preferred method.
a0a1a5fd 6086 *
6ba94429
FW
6087 * Workqueue users should use this function directly iff they want to apply
6088 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
6089 * apply_workqueue_attrs() may race against userland updating the
6090 * attributes.
6091 *
6092 * Return: 0 on success, -errno on failure.
a0a1a5fd 6093 */
6ba94429 6094int workqueue_sysfs_register(struct workqueue_struct *wq)
a0a1a5fd 6095{
6ba94429
FW
6096 struct wq_device *wq_dev;
6097 int ret;
a0a1a5fd 6098
6ba94429 6099 /*
402dd89d 6100 * Adjusting max_active or creating new pwqs by applying
6ba94429
FW
6101 * attributes breaks ordering guarantee. Disallow exposing ordered
6102 * workqueues.
6103 */
0a94efb5 6104 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
6ba94429 6105 return -EINVAL;
a0a1a5fd 6106
6ba94429
FW
6107 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
6108 if (!wq_dev)
6109 return -ENOMEM;
5bcab335 6110
6ba94429
FW
6111 wq_dev->wq = wq;
6112 wq_dev->dev.bus = &wq_subsys;
6ba94429 6113 wq_dev->dev.release = wq_device_release;
23217b44 6114 dev_set_name(&wq_dev->dev, "%s", wq->name);
a0a1a5fd 6115
6ba94429
FW
6116 /*
6117 * unbound_attrs are created separately. Suppress uevent until
6118 * everything is ready.
6119 */
6120 dev_set_uevent_suppress(&wq_dev->dev, true);
a0a1a5fd 6121
6ba94429
FW
6122 ret = device_register(&wq_dev->dev);
6123 if (ret) {
537f4146 6124 put_device(&wq_dev->dev);
6ba94429
FW
6125 wq->wq_dev = NULL;
6126 return ret;
6127 }
a0a1a5fd 6128
6ba94429
FW
6129 if (wq->flags & WQ_UNBOUND) {
6130 struct device_attribute *attr;
a0a1a5fd 6131
6ba94429
FW
6132 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
6133 ret = device_create_file(&wq_dev->dev, attr);
6134 if (ret) {
6135 device_unregister(&wq_dev->dev);
6136 wq->wq_dev = NULL;
6137 return ret;
a0a1a5fd
TH
6138 }
6139 }
6140 }
6ba94429
FW
6141
6142 dev_set_uevent_suppress(&wq_dev->dev, false);
6143 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
6144 return 0;
a0a1a5fd
TH
6145}
6146
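/*
 * Illustrative sketch (editor addition, not part of the original source):
 * applying custom attributes before exposing the workqueue in sysfs, per
 * the comment above.  Uses alloc_workqueue_attrs()/apply_workqueue_attrs()
 * as seen elsewhere in this file; my_wq and the nice value are
 * hypothetical.
 *
 *	struct workqueue_attrs *attrs;
 *
 *	my_wq = alloc_workqueue("my_driver", WQ_UNBOUND, 0);
 *	attrs = alloc_workqueue_attrs();
 *	if (my_wq && attrs) {
 *		attrs->nice = -5;
 *		apply_workqueue_attrs(my_wq, attrs);
 *		workqueue_sysfs_register(my_wq);
 *	}
 *	free_workqueue_attrs(attrs);
 */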
6147/**
6ba94429
FW
6148 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
6149 * @wq: the workqueue to unregister
a0a1a5fd 6150 *
6ba94429 6151 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
a0a1a5fd 6152 */
6ba94429 6153static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
a0a1a5fd 6154{
6ba94429 6155 struct wq_device *wq_dev = wq->wq_dev;
8b03ae3c 6156
6ba94429
FW
6157 if (!wq->wq_dev)
6158 return;
a0a1a5fd 6159
6ba94429
FW
6160 wq->wq_dev = NULL;
6161 device_unregister(&wq_dev->dev);
a0a1a5fd 6162}
6ba94429
FW
6163#else /* CONFIG_SYSFS */
6164static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
6165#endif /* CONFIG_SYSFS */
a0a1a5fd 6166
82607adc
TH
6167/*
6168 * Workqueue watchdog.
6169 *
6170 * Stalls may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
6171 * flush dependency, a concurrency managed work item which stays RUNNING
6172 * indefinitely. Workqueue stalls can be very difficult to debug as the
6173 * usual warning mechanisms don't trigger and internal workqueue state is
6174 * largely opaque.
6175 *
6176 * Workqueue watchdog monitors all worker pools periodically and dumps
6177 * state if some pools fail to make forward progress for a while, where
6178 * forward progress is defined as the first item on ->worklist changing.
6179 *
6180 * This mechanism is controlled through the kernel parameter
6181 * "workqueue.watchdog_thresh" which can be updated at runtime through the
6182 * corresponding sysfs parameter file.
6183 */
6184#ifdef CONFIG_WQ_WATCHDOG
6185
82607adc 6186static unsigned long wq_watchdog_thresh = 30;
5cd79d6a 6187static struct timer_list wq_watchdog_timer;
82607adc
TH
6188
6189static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
6190static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
6191
cd2440d6
PM
6192/*
6193 * Show workers that might prevent the processing of pending work items.
6194 * The only candidates are CPU-bound workers in the running state.
6195 * Pending work items should be handled by another idle worker
6196 * in all other situations.
6197 */
6198static void show_cpu_pool_hog(struct worker_pool *pool)
6199{
6200 struct worker *worker;
6201 unsigned long flags;
6202 int bkt;
6203
6204 raw_spin_lock_irqsave(&pool->lock, flags);
6205
6206 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6207 if (task_is_running(worker->task)) {
6208 /*
6209 * Defer printing to avoid deadlocks in console
6210 * drivers that queue work while holding locks
6211 * also taken in their write paths.
6212 */
6213 printk_deferred_enter();
6214
6215 pr_info("pool %d:\n", pool->id);
6216 sched_show_task(worker->task);
6217
6218 printk_deferred_exit();
6219 }
6220 }
6221
6222 raw_spin_unlock_irqrestore(&pool->lock, flags);
6223}
6224
6225static void show_cpu_pools_hogs(void)
6226{
6227 struct worker_pool *pool;
6228 int pi;
6229
6230 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
6231
6232 rcu_read_lock();
6233
6234 for_each_pool(pool, pi) {
6235 if (pool->cpu_stall)
6236 show_cpu_pool_hog(pool);
6237
6238 }
6239
6240 rcu_read_unlock();
6241}
6242
82607adc
TH
6243static void wq_watchdog_reset_touched(void)
6244{
6245 int cpu;
6246
6247 wq_watchdog_touched = jiffies;
6248 for_each_possible_cpu(cpu)
6249 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
6250}
6251
5cd79d6a 6252static void wq_watchdog_timer_fn(struct timer_list *unused)
82607adc
TH
6253{
6254 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
6255 bool lockup_detected = false;
cd2440d6 6256 bool cpu_pool_stall = false;
940d71c6 6257 unsigned long now = jiffies;
82607adc
TH
6258 struct worker_pool *pool;
6259 int pi;
6260
6261 if (!thresh)
6262 return;
6263
6264 rcu_read_lock();
6265
6266 for_each_pool(pool, pi) {
6267 unsigned long pool_ts, touched, ts;
6268
cd2440d6 6269 pool->cpu_stall = false;
82607adc
TH
6270 if (list_empty(&pool->worklist))
6271 continue;
6272
940d71c6
SS
6273 /*
6274 * If a virtual machine is stopped by the host it can look to
6275 * the watchdog like a stall.
6276 */
6277 kvm_check_and_clear_guest_paused();
6278
82607adc 6279 /* get the latest of pool and touched timestamps */
89e28ce6
WQ
6280 if (pool->cpu >= 0)
6281 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
6282 else
6283 touched = READ_ONCE(wq_watchdog_touched);
82607adc 6284 pool_ts = READ_ONCE(pool->watchdog_ts);
82607adc
TH
6285
6286 if (time_after(pool_ts, touched))
6287 ts = pool_ts;
6288 else
6289 ts = touched;
6290
82607adc 6291 /* did we stall? */
940d71c6 6292 if (time_after(now, ts + thresh)) {
82607adc 6293 lockup_detected = true;
cd2440d6
PM
6294 if (pool->cpu >= 0) {
6295 pool->cpu_stall = true;
6296 cpu_pool_stall = true;
6297 }
82607adc
TH
6298 pr_emerg("BUG: workqueue lockup - pool");
6299 pr_cont_pool_info(pool);
6300 pr_cont(" stuck for %us!\n",
940d71c6 6301 jiffies_to_msecs(now - pool_ts) / 1000);
82607adc 6302 }
cd2440d6
PM
6303
6304
82607adc
TH
6305 }
6306
6307 rcu_read_unlock();
6308
6309 if (lockup_detected)
55df0933 6310 show_all_workqueues();
82607adc 6311
cd2440d6
PM
6312 if (cpu_pool_stall)
6313 show_cpu_pools_hogs();
6314
82607adc
TH
6315 wq_watchdog_reset_touched();
6316 mod_timer(&wq_watchdog_timer, jiffies + thresh);
6317}
6318
cb9d7fd5 6319notrace void wq_watchdog_touch(int cpu)
82607adc
TH
6320{
6321 if (cpu >= 0)
6322 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
89e28ce6
WQ
6323
6324 wq_watchdog_touched = jiffies;
82607adc
TH
6325}
6326
6327static void wq_watchdog_set_thresh(unsigned long thresh)
6328{
6329 wq_watchdog_thresh = 0;
6330 del_timer_sync(&wq_watchdog_timer);
6331
6332 if (thresh) {
6333 wq_watchdog_thresh = thresh;
6334 wq_watchdog_reset_touched();
6335 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
6336 }
6337}
6338
6339static int wq_watchdog_param_set_thresh(const char *val,
6340 const struct kernel_param *kp)
6341{
6342 unsigned long thresh;
6343 int ret;
6344
6345 ret = kstrtoul(val, 0, &thresh);
6346 if (ret)
6347 return ret;
6348
6349 if (system_wq)
6350 wq_watchdog_set_thresh(thresh);
6351 else
6352 wq_watchdog_thresh = thresh;
6353
6354 return 0;
6355}
6356
6357static const struct kernel_param_ops wq_watchdog_thresh_ops = {
6358 .set = wq_watchdog_param_set_thresh,
6359 .get = param_get_ulong,
6360};
6361
6362module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
6363 0644);
6364
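A usage note, with the usual caveat that details can drift between kernel versions: with CONFIG_WQ_WATCHDOG enabled, the threshold registered above can be given at boot as workqueue.watchdog_thresh=<seconds> on the kernel command line, or adjusted at run time by writing to /sys/module/workqueue/parameters/watchdog_thresh (the parameter is created with 0644 permissions). Writing 0 disables the detector, matching the early return on !thresh in wq_watchdog_timer_fn().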
6365static void wq_watchdog_init(void)
6366{
5cd79d6a 6367 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
82607adc
TH
6368 wq_watchdog_set_thresh(wq_watchdog_thresh);
6369}
6370
6371#else /* CONFIG_WQ_WATCHDOG */
6372
6373static inline void wq_watchdog_init(void) { }
6374
6375#endif /* CONFIG_WQ_WATCHDOG */
6376
bce90380
TH
6377static void __init wq_numa_init(void)
6378{
6379 cpumask_var_t *tbl;
6380 int node, cpu;
6381
bce90380
TH
6382 if (num_possible_nodes() <= 1)
6383 return;
6384
d55262c4
TH
6385 if (wq_disable_numa) {
6386 pr_info("workqueue: NUMA affinity support disabled\n");
6387 return;
6388 }
6389
f728c4a9
ZL
6390 for_each_possible_cpu(cpu) {
6391 if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) {
6392 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
6393 return;
6394 }
6395 }
6396
be69d00d 6397 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
4c16bd32
TH
6398 BUG_ON(!wq_update_unbound_numa_attrs_buf);
6399
bce90380
TH
6400 /*
 6401 * We want a mask of the possible CPUs of each node, which isn't
 6402 * readily available. Build one from cpu_to_node(), which should
 6403 * have been fully initialized by now.
6404 */
6396bb22 6405 tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL);
bce90380
TH
6406 BUG_ON(!tbl);
6407
6408 for_each_node(node)
5a6024f1 6409 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
1be0c25d 6410 node_online(node) ? node : NUMA_NO_NODE));
bce90380
TH
6411
6412 for_each_possible_cpu(cpu) {
6413 node = cpu_to_node(cpu);
bce90380
TH
6414 cpumask_set_cpu(cpu, tbl[node]);
6415 }
6416
6417 wq_numa_possible_cpumask = tbl;
6418 wq_numa_enabled = true;
6419}
6420
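The table built above simply inverts the cpu_to_node() mapping into one possible-CPU mask per node. Below is a tiny user-space sketch of that inversion (illustration only; plain bitmasks stand in for cpumask_var_t and the cpu_to_node[] sample data is made up).

#include <stdio.h>

#define NR_CPUS		8
#define NR_NODES	2

int main(void)
{
	int cpu_to_node[NR_CPUS] = { 0, 0, 1, 1, 0, 0, 1, 1 };	/* made-up topology */
	unsigned int node_mask[NR_NODES] = { 0 };
	int cpu, node;

	/* invert cpu -> node into per-node CPU masks, as wq_numa_init() does */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		node_mask[cpu_to_node[cpu]] |= 1u << cpu;

	for (node = 0; node < NR_NODES; node++)
		printf("node %d possible cpumask: 0x%02x\n", node, node_mask[node]);
	return 0;
}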
3347fa09
TH
6421/**
6422 * workqueue_init_early - early init for workqueue subsystem
6423 *
 6424 * This is the first half of the two-stage workqueue subsystem initialization
 6425 * and is invoked as soon as the bare basics - memory allocation, cpumasks and
6426 * idr are up. It sets up all the data structures and system workqueues
6427 * and allows early boot code to create workqueues and queue/cancel work
6428 * items. Actual work item execution starts only after kthreads can be
6429 * created and scheduled right before early initcalls.
6430 */
2333e829 6431void __init workqueue_init_early(void)
1da177e4 6432{
7a4e344c
TH
6433 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
6434 int i, cpu;
c34056a3 6435
10cdb157 6436 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
e904e6c2 6437
b05a7928 6438 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
04d4e665
FW
6439 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
6440 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
b05a7928 6441
e904e6c2
TH
6442 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
6443
706026c2 6444 /* initialize CPU pools */
29c91e99 6445 for_each_possible_cpu(cpu) {
4ce62e9e 6446 struct worker_pool *pool;
8b03ae3c 6447
7a4e344c 6448 i = 0;
f02ae73a 6449 for_each_cpu_worker_pool(pool, cpu) {
7a4e344c 6450 BUG_ON(init_worker_pool(pool));
ec22ca5e 6451 pool->cpu = cpu;
29c91e99 6452 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
7a4e344c 6453 pool->attrs->nice = std_nice[i++];
f3f90ad4 6454 pool->node = cpu_to_node(cpu);
7a4e344c 6455
9daf9e67 6456 /* alloc pool ID */
68e13a67 6457 mutex_lock(&wq_pool_mutex);
9daf9e67 6458 BUG_ON(worker_pool_assign_id(pool));
68e13a67 6459 mutex_unlock(&wq_pool_mutex);
4ce62e9e 6460 }
8b03ae3c
TH
6461 }
6462
8a2b7538 6463 /* create default unbound and ordered wq attrs */
29c91e99
TH
6464 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
6465 struct workqueue_attrs *attrs;
6466
be69d00d 6467 BUG_ON(!(attrs = alloc_workqueue_attrs()));
29c91e99 6468 attrs->nice = std_nice[i];
29c91e99 6469 unbound_std_wq_attrs[i] = attrs;
8a2b7538
TH
6470
6471 /*
6472 * An ordered wq should have only one pwq as ordering is
6473 * guaranteed by max_active which is enforced by pwqs.
6474 * Turn off NUMA so that dfl_pwq is used for all nodes.
6475 */
be69d00d 6476 BUG_ON(!(attrs = alloc_workqueue_attrs()));
8a2b7538
TH
6477 attrs->nice = std_nice[i];
6478 attrs->no_numa = true;
6479 ordered_wq_attrs[i] = attrs;
29c91e99
TH
6480 }
6481
d320c038 6482 system_wq = alloc_workqueue("events", 0, 0);
1aabe902 6483 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
d320c038 6484 system_long_wq = alloc_workqueue("events_long", 0, 0);
f3421797
TH
6485 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
6486 WQ_UNBOUND_MAX_ACTIVE);
24d51add
TH
6487 system_freezable_wq = alloc_workqueue("events_freezable",
6488 WQ_FREEZABLE, 0);
0668106c
VK
6489 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
6490 WQ_POWER_EFFICIENT, 0);
6491 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
6492 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
6493 0);
1aabe902 6494 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
0668106c
VK
6495 !system_unbound_wq || !system_freezable_wq ||
6496 !system_power_efficient_wq ||
6497 !system_freezable_power_efficient_wq);
3347fa09
TH
6498}
6499
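As the kernel-doc for workqueue_init_early() spells out, work items can be queued between the two init stages even though nothing executes them yet. The sketch below is a hypothetical built-in caller relying on that (all names are made up, and the early_initcall is just a convenient stand-in for an early call site; real pre-initcall code would queue from its own setup path).

#include <linux/init.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

/* Hypothetical built-in example: the work item may be queued any time after
 * workqueue_init_early(), but early_setup_fn() only runs once
 * workqueue_init() has populated the worker pools.
 */
static void early_setup_fn(struct work_struct *work)
{
	pr_info("deferred early setup running\n");
}

static DECLARE_WORK(early_setup_work, early_setup_fn);

static int __init queue_early_setup(void)
{
	schedule_work(&early_setup_work);	/* queued on system_wq */
	return 0;
}
early_initcall(queue_early_setup);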
6500/**
6501 * workqueue_init - bring workqueue subsystem fully online
6502 *
 6503 * This is the latter half of the two-stage workqueue subsystem initialization
 6504 * and is invoked as soon as kthreads can be created and scheduled.
6505 * Workqueues have been created and work items queued on them, but there
6506 * are no kworkers executing the work items yet. Populate the worker pools
6507 * with the initial workers and enable future kworker creations.
6508 */
2333e829 6509void __init workqueue_init(void)
3347fa09 6510{
2186d9f9 6511 struct workqueue_struct *wq;
3347fa09
TH
6512 struct worker_pool *pool;
6513 int cpu, bkt;
6514
2186d9f9
TH
6515 /*
6516 * It'd be simpler to initialize NUMA in workqueue_init_early() but
6517 * CPU to node mapping may not be available that early on some
 6518 * archs such as power and arm64. Per-cpu pools created earlier
 6519 * could be missing their node hint, and unbound pools their NUMA
 6520 * affinity, so fix them up here.
40c17f75
TH
6521 *
6522 * Also, while iterating workqueues, create rescuers if requested.
2186d9f9
TH
6523 */
6524 wq_numa_init();
6525
6526 mutex_lock(&wq_pool_mutex);
6527
6528 for_each_possible_cpu(cpu) {
6529 for_each_cpu_worker_pool(pool, cpu) {
6530 pool->node = cpu_to_node(cpu);
6531 }
6532 }
6533
40c17f75 6534 list_for_each_entry(wq, &workqueues, list) {
2186d9f9 6535 wq_update_unbound_numa(wq, smp_processor_id(), true);
40c17f75
TH
6536 WARN(init_rescuer(wq),
6537 "workqueue: failed to create early rescuer for %s",
6538 wq->name);
6539 }
2186d9f9
TH
6540
6541 mutex_unlock(&wq_pool_mutex);
6542
3347fa09
TH
6543 /* create the initial workers */
6544 for_each_online_cpu(cpu) {
6545 for_each_cpu_worker_pool(pool, cpu) {
6546 pool->flags &= ~POOL_DISASSOCIATED;
6547 BUG_ON(!create_worker(pool));
6548 }
6549 }
6550
6551 hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
6552 BUG_ON(!create_worker(pool));
6553
6554 wq_online = true;
82607adc 6555 wq_watchdog_init();
1da177e4 6556}
c4f135d6
TH
6557
6558/*
 6559 * Despite the naming, this is a no-op function which is here only to avoid a
 6560 * link error. Since the compile-time warning may fail to catch misuse, we also
 6561 * need to emit a run-time warning from __flush_workqueue().
6562 */
6563void __warn_flushing_systemwide_wq(void) { }
6564EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
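The run-time warning referred to above is emitted when a caller flushes one of the shared system-wide workqueues, most commonly via flush_scheduled_work(). A commonly recommended alternative, sketched below with made-up names, is for the subsystem to allocate its own workqueue so a flush only ever waits for its own work items.

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/workqueue.h>

/* Hypothetical sketch (mydrv is a made-up name): instead of calling
 * flush_scheduled_work(), which flushes the shared system_wq and triggers
 * the warning above, the driver owns a workqueue of its own.
 */
static struct workqueue_struct *mydrv_wq;

static int __init mydrv_init(void)
{
	mydrv_wq = alloc_workqueue("mydrv", 0, 0);
	if (!mydrv_wq)
		return -ENOMEM;
	return 0;
}
module_init(mydrv_init);

static void __exit mydrv_exit(void)
{
	destroy_workqueue(mydrv_wq);	/* drains remaining mydrv work items */
}
module_exit(mydrv_exit);

MODULE_LICENSE("GPL");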