workqueue: Don't call cpumask_test_cpu() with -1 CPU in wq_update_node_max_active()
[linux-block.git] / kernel / workqueue.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4 2/*
c54fce6e 3 * kernel/workqueue.c - generic async execution with shared worker pool
1da177e4 4 *
c54fce6e 5 * Copyright (C) 2002 Ingo Molnar
1da177e4 6 *
c54fce6e
TH
7 * Derived from the taskqueue/keventd code by:
8 * David Woodhouse <dwmw2@infradead.org>
9 * Andrew Morton
10 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
11 * Theodore Ts'o <tytso@mit.edu>
1da177e4 12 *
c54fce6e 13 * Made to use alloc_percpu by Christoph Lameter.
1da177e4 14 *
c54fce6e
TH
15 * Copyright (C) 2010 SUSE Linux Products GmbH
16 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
89ada679 17 *
c54fce6e
TH
18 * This is the generic async execution mechanism. Work items as are
19 * executed in process context. The worker pool is shared and
b11895c4
L
20 * automatically managed. There are two worker pools for each CPU (one for
21 * normal work items and the other for high priority ones) and some extra
22 * pools for workqueues which are not bound to any specific CPU - the
23 * number of these backing pools is dynamic.
c54fce6e 24 *
9a261491 25 * Please read Documentation/core-api/workqueue.rst for details.
1da177e4
LT
26 */
27
9984de1a 28#include <linux/export.h>
1da177e4
LT
29#include <linux/kernel.h>
30#include <linux/sched.h>
31#include <linux/init.h>
32#include <linux/signal.h>
33#include <linux/completion.h>
34#include <linux/workqueue.h>
35#include <linux/slab.h>
36#include <linux/cpu.h>
37#include <linux/notifier.h>
38#include <linux/kthread.h>
1fa44eca 39#include <linux/hardirq.h>
46934023 40#include <linux/mempolicy.h>
341a5958 41#include <linux/freezer.h>
d5abe669 42#include <linux/debug_locks.h>
4e6045f1 43#include <linux/lockdep.h>
c34056a3 44#include <linux/idr.h>
29c91e99 45#include <linux/jhash.h>
42f8570f 46#include <linux/hashtable.h>
76af4d93 47#include <linux/rculist.h>
bce90380 48#include <linux/nodemask.h>
4c16bd32 49#include <linux/moduleparam.h>
3d1cb205 50#include <linux/uaccess.h>
c98a9805 51#include <linux/sched/isolation.h>
cd2440d6 52#include <linux/sched/debug.h>
62635ea8 53#include <linux/nmi.h>
940d71c6 54#include <linux/kvm_para.h>
aa6fde93 55#include <linux/delay.h>
e22bee78 56
ea138446 57#include "workqueue_internal.h"
1da177e4 58
e563d0a7 59enum worker_pool_flags {
24647570
TH
60 /*
61 * worker_pool flags
bc2ae0f5 62 *
24647570 63 * A bound pool is either associated or disassociated with its CPU.
bc2ae0f5
TH
64 * While associated (!DISASSOCIATED), all workers are bound to the
65 * CPU and none has %WORKER_UNBOUND set and concurrency management
66 * is in effect.
67 *
68 * While DISASSOCIATED, the cpu may be offline and all workers have
69 * %WORKER_UNBOUND set and concurrency management disabled, and may
24647570 70 * be executing on any CPU. The pool behaves as an unbound one.
bc2ae0f5 71 *
bc3a1afc 72 * Note that DISASSOCIATED should be flipped only while holding
1258fae7 73 * wq_pool_attach_mutex to avoid changing binding state while
4736cbf7 74 * worker_attach_to_pool() is in progress.
bc2ae0f5 75 */
692b4825 76 POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
24647570 77 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
e563d0a7 78};
db7bccf4 79
e563d0a7 80enum worker_flags {
c8e55f36 81 /* worker flags */
c8e55f36
TH
82 WORKER_DIE = 1 << 1, /* die die die */
83 WORKER_IDLE = 1 << 2, /* is idle */
e22bee78 84 WORKER_PREP = 1 << 3, /* preparing to run works */
fb0e7beb 85 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
f3421797 86 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
a9ab775b 87 WORKER_REBOUND = 1 << 8, /* worker was rebound */
e22bee78 88
a9ab775b
TH
89 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
90 WORKER_UNBOUND | WORKER_REBOUND,
e563d0a7 91};
db7bccf4 92
e563d0a7 93enum wq_internal_consts {
e34cdddb 94 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
4ce62e9e 95
29c91e99 96 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
c8e55f36 97 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
db7bccf4 98
e22bee78
TH
99 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
100 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
101
3233cdbd
TH
102 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
103 /* call for help after 10ms
104 (min two ticks) */
e22bee78
TH
105 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
106 CREATE_COOLDOWN = HZ, /* time to breath after fail */
e22bee78
TH
107
108 /*
109 * Rescue workers are used only on emergencies and shared by
8698a745 110 * all cpus. Give MIN_NICE.
e22bee78 111 */
8698a745
DY
112 RESCUER_NICE_LEVEL = MIN_NICE,
113 HIGHPRI_NICE_LEVEL = MIN_NICE,
ecf6881f 114
31c89007 115 WQ_NAME_LEN = 32,
c8e55f36 116};
1da177e4
LT
117
118/*
4690c4ab
TH
119 * Structure fields follow one of the following exclusion rules.
120 *
e41e704b
TH
121 * I: Modifiable by initialization/destruction paths and read-only for
122 * everyone else.
4690c4ab 123 *
e22bee78
TH
124 * P: Preemption protected. Disabling preemption is enough and should
125 * only be modified and accessed from the local cpu.
126 *
d565ed63 127 * L: pool->lock protected. Access with pool->lock held.
4690c4ab 128 *
5797b1c1
TH
129 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
130 * reads.
131 *
bdf8b9bf
TH
132 * K: Only modified by worker while holding pool->lock. Can be safely read by
133 * self, while holding pool->lock or from IRQ context if %current is the
134 * kworker.
135 *
136 * S: Only modified by worker self.
137 *
1258fae7 138 * A: wq_pool_attach_mutex protected.
822d8405 139 *
68e13a67 140 * PL: wq_pool_mutex protected.
5bcab335 141 *
24acfb71 142 * PR: wq_pool_mutex protected for writes. RCU protected for reads.
76af4d93 143 *
5b95e1af
LJ
144 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
145 *
146 * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
24acfb71 147 * RCU for reads.
5b95e1af 148 *
3c25a55d
LJ
149 * WQ: wq->mutex protected.
150 *
24acfb71 151 * WR: wq->mutex protected for writes. RCU protected for reads.
2e109a28 152 *
a045a272
TH
153 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
154 * with READ_ONCE() without locking.
155 *
2e109a28 156 * MD: wq_mayday_lock protected.
cd2440d6
PM
157 *
158 * WD: Used internally by the watchdog.
1da177e4 159 */
1da177e4 160
2eaebdb3 161/* struct worker is defined in workqueue_internal.h */
c34056a3 162
bd7bdd43 163struct worker_pool {
a9b8a985 164 raw_spinlock_t lock; /* the pool lock */
d84ff051 165 int cpu; /* I: the associated cpu */
f3f90ad4 166 int node; /* I: the associated node ID */
9daf9e67 167 int id; /* I: pool ID */
bc8b50c2 168 unsigned int flags; /* L: flags */
bd7bdd43 169
82607adc 170 unsigned long watchdog_ts; /* L: watchdog timestamp */
cd2440d6 171 bool cpu_stall; /* WD: stalled cpu bound pool */
82607adc 172
bc35f7ef
LJ
173 /*
174 * The counter is incremented in a process context on the associated CPU
175 * w/ preemption disabled, and decremented or reset in the same context
176 * but w/ pool->lock held. The readers grab pool->lock and are
177 * guaranteed to see if the counter reached zero.
178 */
179 int nr_running;
84f91c62 180
bd7bdd43 181 struct list_head worklist; /* L: list of pending works */
ea1abd61 182
5826cc8f
LJ
183 int nr_workers; /* L: total number of workers */
184 int nr_idle; /* L: currently idle workers */
bd7bdd43 185
2c1f1a91 186 struct list_head idle_list; /* L: list of idle workers */
bd7bdd43 187 struct timer_list idle_timer; /* L: worker idle timeout */
3f959aa3
VS
188 struct work_struct idle_cull_work; /* L: worker idle cleanup */
189
190 struct timer_list mayday_timer; /* L: SOS timer for workers */
bd7bdd43 191
c5aa87bb 192 /* a workers is either on busy_hash or idle_list, or the manager */
c9e7cf27
TH
193 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
194 /* L: hash of busy workers */
195
2607d7a6 196 struct worker *manager; /* L: purely informational */
92f9c5c4 197 struct list_head workers; /* A: attached workers */
e02b9312 198 struct list_head dying_workers; /* A: workers about to die */
60f5a4bc 199 struct completion *detach_completion; /* all workers detached */
e19e397a 200
7cda9aae 201 struct ida worker_ida; /* worker IDs for task name */
e19e397a 202
7a4e344c 203 struct workqueue_attrs *attrs; /* I: worker attributes */
68e13a67
LJ
204 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
205 int refcnt; /* PL: refcnt for unbound pools */
7a4e344c 206
29c91e99 207 /*
24acfb71 208 * Destruction of pool is RCU protected to allow dereferences
29c91e99
TH
209 * from get_work_pool().
210 */
211 struct rcu_head rcu;
84f91c62 212};
8b03ae3c 213
725e8ec5
TH
214/*
215 * Per-pool_workqueue statistics. These can be monitored using
216 * tools/workqueue/wq_monitor.py.
217 */
218enum pool_workqueue_stats {
219 PWQ_STAT_STARTED, /* work items started execution */
220 PWQ_STAT_COMPLETED, /* work items completed execution */
8a1dd1e5 221 PWQ_STAT_CPU_TIME, /* total CPU time consumed */
616db877 222 PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
725e8ec5 223 PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
8639eceb 224 PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
725e8ec5
TH
225 PWQ_STAT_MAYDAY, /* maydays to rescuer */
226 PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
227
228 PWQ_NR_STATS,
229};
230
1da177e4 231/*
112202d9
TH
232 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
233 * of work_struct->data are used for flags and the remaining high bits
234 * point to the pwq; thus, pwqs need to be aligned at two's power of the
235 * number of flag bits.
1da177e4 236 */
112202d9 237struct pool_workqueue {
bd7bdd43 238 struct worker_pool *pool; /* I: the associated pool */
4690c4ab 239 struct workqueue_struct *wq; /* I: the owning workqueue */
73f53c4a
TH
240 int work_color; /* L: current color */
241 int flush_color; /* L: flushing color */
8864b4e5 242 int refcnt; /* L: reference count */
73f53c4a
TH
243 int nr_in_flight[WORK_NR_COLORS];
244 /* L: nr of in_flight works */
018f3a13
LJ
245
246 /*
247 * nr_active management and WORK_STRUCT_INACTIVE:
248 *
249 * When pwq->nr_active >= max_active, new work item is queued to
250 * pwq->inactive_works instead of pool->worklist and marked with
251 * WORK_STRUCT_INACTIVE.
252 *
5797b1c1
TH
253 * All work items marked with WORK_STRUCT_INACTIVE do not participate in
254 * nr_active and all work items in pwq->inactive_works are marked with
255 * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
256 * in pwq->inactive_works. Some of them are ready to run in
257 * pool->worklist or worker->scheduled. Those work itmes are only struct
258 * wq_barrier which is used for flush_work() and should not participate
259 * in nr_active. For non-barrier work item, it is marked with
260 * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
018f3a13 261 */
1e19ffc6 262 int nr_active; /* L: nr of active works */
f97a4a1a 263 struct list_head inactive_works; /* L: inactive works */
5797b1c1 264 struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
3c25a55d 265 struct list_head pwqs_node; /* WR: node on wq->pwqs */
2e109a28 266 struct list_head mayday_node; /* MD: node on wq->maydays */
8864b4e5 267
725e8ec5
TH
268 u64 stats[PWQ_NR_STATS];
269
8864b4e5 270 /*
967b494e 271 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
687a9aa5
TH
272 * and pwq_release_workfn() for details. pool_workqueue itself is also
273 * RCU protected so that the first pwq can be determined without
967b494e 274 * grabbing wq->mutex.
8864b4e5 275 */
687a9aa5 276 struct kthread_work release_work;
8864b4e5 277 struct rcu_head rcu;
e904e6c2 278} __aligned(1 << WORK_STRUCT_FLAG_BITS);
1da177e4 279
73f53c4a
TH
280/*
281 * Structure used to wait for workqueue flush.
282 */
283struct wq_flusher {
3c25a55d
LJ
284 struct list_head list; /* WQ: list of flushers */
285 int flush_color; /* WQ: flush color waiting for */
73f53c4a
TH
286 struct completion done; /* flush completion */
287};
288
226223ab
TH
289struct wq_device;
290
91ccc6e7
TH
291/*
292 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
293 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
294 * As sharing a single nr_active across multiple sockets can be very expensive,
295 * the counting and enforcement is per NUMA node.
5797b1c1
TH
296 *
297 * The following struct is used to enforce per-node max_active. When a pwq wants
298 * to start executing a work item, it should increment ->nr using
299 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
300 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
301 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
302 * round-robin order.
91ccc6e7
TH
303 */
304struct wq_node_nr_active {
5797b1c1
TH
305 int max; /* per-node max_active */
306 atomic_t nr; /* per-node nr_active */
307 raw_spinlock_t lock; /* nests inside pool locks */
308 struct list_head pending_pwqs; /* LN: pwqs with inactive works */
91ccc6e7
TH
309};
310
1da177e4 311/*
c5aa87bb
TH
312 * The externally visible workqueue. It relays the issued work items to
313 * the appropriate worker_pool through its pool_workqueues.
1da177e4
LT
314 */
315struct workqueue_struct {
3c25a55d 316 struct list_head pwqs; /* WR: all pwqs of this wq */
e2dca7ad 317 struct list_head list; /* PR: list of all workqueues */
73f53c4a 318
3c25a55d
LJ
319 struct mutex mutex; /* protects this wq */
320 int work_color; /* WQ: current work color */
321 int flush_color; /* WQ: current flush color */
112202d9 322 atomic_t nr_pwqs_to_flush; /* flush in progress */
3c25a55d
LJ
323 struct wq_flusher *first_flusher; /* WQ: first flusher */
324 struct list_head flusher_queue; /* WQ: flush waiters */
325 struct list_head flusher_overflow; /* WQ: flush overflow list */
73f53c4a 326
2e109a28 327 struct list_head maydays; /* MD: pwqs requesting rescue */
30ae2fc0 328 struct worker *rescuer; /* MD: rescue worker */
e22bee78 329
87fc741e 330 int nr_drainers; /* WQ: drain in progress */
5797b1c1
TH
331
332 /* See alloc_workqueue() function comment for info on min/max_active */
a045a272 333 int max_active; /* WO: max active works */
5797b1c1 334 int min_active; /* WO: min active works */
a045a272 335 int saved_max_active; /* WQ: saved max_active */
5797b1c1 336 int saved_min_active; /* WQ: saved min_active */
226223ab 337
5b95e1af 338 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
9f66cff2 339 struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */
6029a918 340
226223ab
TH
341#ifdef CONFIG_SYSFS
342 struct wq_device *wq_dev; /* I: for sysfs interface */
343#endif
4e6045f1 344#ifdef CONFIG_LOCKDEP
669de8bd
BVA
345 char *lock_name;
346 struct lock_class_key key;
4690c4ab 347 struct lockdep_map lockdep_map;
4e6045f1 348#endif
ecf6881f 349 char name[WQ_NAME_LEN]; /* I: workqueue name */
2728fd2f 350
e2dca7ad 351 /*
24acfb71
TG
352 * Destruction of workqueue_struct is RCU protected to allow walking
353 * the workqueues list without grabbing wq_pool_mutex.
e2dca7ad
TH
354 * This is used to dump all workqueues from sysrq.
355 */
356 struct rcu_head rcu;
357
2728fd2f
TH
358 /* hot fields used during command issue, aligned to cacheline */
359 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
636b927e 360 struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
91ccc6e7 361 struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
1da177e4
LT
362};
363
e904e6c2
TH
364static struct kmem_cache *pwq_cache;
365
84193c07
TH
366/*
367 * Each pod type describes how CPUs should be grouped for unbound workqueues.
368 * See the comment above workqueue_attrs->affn_scope.
369 */
370struct wq_pod_type {
371 int nr_pods; /* number of pods */
372 cpumask_var_t *pod_cpus; /* pod -> cpus */
373 int *pod_node; /* pod -> node */
374 int *cpu_pod; /* cpu -> pod */
375};
376
377static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
523a301e 378static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
63c5484e
TH
379
380static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
523a301e 381 [WQ_AFFN_DFL] = "default",
63c5484e
TH
382 [WQ_AFFN_CPU] = "cpu",
383 [WQ_AFFN_SMT] = "smt",
384 [WQ_AFFN_CACHE] = "cache",
385 [WQ_AFFN_NUMA] = "numa",
386 [WQ_AFFN_SYSTEM] = "system",
387};
bce90380 388
616db877
TH
389/*
390 * Per-cpu work items which run for longer than the following threshold are
391 * automatically considered CPU intensive and excluded from concurrency
392 * management to prevent them from noticeably delaying other per-cpu work items.
aa6fde93
TH
393 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
394 * The actual value is initialized in wq_cpu_intensive_thresh_init().
616db877 395 */
aa6fde93 396static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
616db877
TH
397module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
398
cee22a15 399/* see the comment above the definition of WQ_POWER_EFFICIENT */
552f530c 400static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
cee22a15
VK
401module_param_named(power_efficient, wq_power_efficient, bool, 0444);
402
863b710b 403static bool wq_online; /* can kworkers be created yet? */
3347fa09 404
fef59c9c
TH
405/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
406static struct workqueue_attrs *wq_update_pod_attrs_buf;
4c16bd32 407
68e13a67 408static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
1258fae7 409static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
a9b8a985 410static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
d8bb65ab
SAS
411/* wait for manager to go away */
412static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
5bcab335 413
e2dca7ad 414static LIST_HEAD(workqueues); /* PR: list of all workqueues */
68e13a67 415static bool workqueue_freezing; /* PL: have wqs started freezing? */
7d19c5ce 416
99c621ef 417/* PL&A: allowable cpus for unbound wqs and work items */
ef557180
MG
418static cpumask_var_t wq_unbound_cpumask;
419
fe28f631
WL
420/* PL: user requested unbound cpumask via sysfs */
421static cpumask_var_t wq_requested_unbound_cpumask;
422
423/* PL: isolated cpumask to be excluded from unbound cpumask */
424static cpumask_var_t wq_isolated_cpumask;
425
ace3c549 426/* for further constrain wq_unbound_cpumask by cmdline parameter*/
427static struct cpumask wq_cmdline_cpumask __initdata;
428
ef557180
MG
429/* CPU where unbound work was last round robin scheduled from this CPU */
430static DEFINE_PER_CPU(int, wq_rr_cpu_last);
b05a7928 431
f303fccb
TH
432/*
433 * Local execution of unbound work items is no longer guaranteed. The
434 * following always forces round-robin CPU selection on unbound work items
435 * to uncover usages which depend on it.
436 */
437#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
438static bool wq_debug_force_rr_cpu = true;
439#else
440static bool wq_debug_force_rr_cpu = false;
441#endif
442module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
443
7d19c5ce 444/* the per-cpu worker pools */
25528213 445static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
7d19c5ce 446
68e13a67 447static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
7d19c5ce 448
68e13a67 449/* PL: hash of all unbound pools keyed by pool->attrs */
29c91e99
TH
450static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
451
c5aa87bb 452/* I: attributes used when instantiating standard unbound pools on demand */
29c91e99
TH
453static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
454
8a2b7538
TH
455/* I: attributes used when instantiating ordered pools on demand */
456static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
457
967b494e
TH
458/*
459 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
460 * process context while holding a pool lock. Bounce to a dedicated kthread
461 * worker to avoid A-A deadlocks.
462 */
68279f9c 463static struct kthread_worker *pwq_release_worker __ro_after_init;
967b494e 464
68279f9c 465struct workqueue_struct *system_wq __ro_after_init;
ad7b1f84 466EXPORT_SYMBOL(system_wq);
68279f9c 467struct workqueue_struct *system_highpri_wq __ro_after_init;
1aabe902 468EXPORT_SYMBOL_GPL(system_highpri_wq);
68279f9c 469struct workqueue_struct *system_long_wq __ro_after_init;
d320c038 470EXPORT_SYMBOL_GPL(system_long_wq);
68279f9c 471struct workqueue_struct *system_unbound_wq __ro_after_init;
f3421797 472EXPORT_SYMBOL_GPL(system_unbound_wq);
68279f9c 473struct workqueue_struct *system_freezable_wq __ro_after_init;
24d51add 474EXPORT_SYMBOL_GPL(system_freezable_wq);
68279f9c 475struct workqueue_struct *system_power_efficient_wq __ro_after_init;
0668106c 476EXPORT_SYMBOL_GPL(system_power_efficient_wq);
68279f9c 477struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
0668106c 478EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
d320c038 479
7d19c5ce 480static int worker_thread(void *__worker);
6ba94429 481static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
c29eb853 482static void show_pwq(struct pool_workqueue *pwq);
55df0933 483static void show_one_worker_pool(struct worker_pool *pool);
7d19c5ce 484
97bd2347
TH
485#define CREATE_TRACE_POINTS
486#include <trace/events/workqueue.h>
487
68e13a67 488#define assert_rcu_or_pool_mutex() \
24acfb71 489 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
f78f5b90 490 !lockdep_is_held(&wq_pool_mutex), \
24acfb71 491 "RCU or wq_pool_mutex should be held")
5bcab335 492
5b95e1af 493#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
24acfb71 494 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
f78f5b90
PM
495 !lockdep_is_held(&wq->mutex) && \
496 !lockdep_is_held(&wq_pool_mutex), \
24acfb71 497 "RCU, wq->mutex or wq_pool_mutex should be held")
5b95e1af 498
f02ae73a
TH
499#define for_each_cpu_worker_pool(pool, cpu) \
500 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
501 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
7a62c2c8 502 (pool)++)
4ce62e9e 503
17116969
TH
504/**
505 * for_each_pool - iterate through all worker_pools in the system
506 * @pool: iteration cursor
611c92a0 507 * @pi: integer used for iteration
fa1b54e6 508 *
24acfb71 509 * This must be called either with wq_pool_mutex held or RCU read
68e13a67
LJ
510 * locked. If the pool needs to be used beyond the locking in effect, the
511 * caller is responsible for guaranteeing that the pool stays online.
fa1b54e6
TH
512 *
513 * The if/else clause exists only for the lockdep assertion and can be
514 * ignored.
17116969 515 */
611c92a0
TH
516#define for_each_pool(pool, pi) \
517 idr_for_each_entry(&worker_pool_idr, pool, pi) \
68e13a67 518 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
fa1b54e6 519 else
17116969 520
822d8405
TH
521/**
522 * for_each_pool_worker - iterate through all workers of a worker_pool
523 * @worker: iteration cursor
822d8405
TH
524 * @pool: worker_pool to iterate workers of
525 *
1258fae7 526 * This must be called with wq_pool_attach_mutex.
822d8405
TH
527 *
528 * The if/else clause exists only for the lockdep assertion and can be
529 * ignored.
530 */
da028469
LJ
531#define for_each_pool_worker(worker, pool) \
532 list_for_each_entry((worker), &(pool)->workers, node) \
1258fae7 533 if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
822d8405
TH
534 else
535
49e3cf44
TH
536/**
537 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
538 * @pwq: iteration cursor
539 * @wq: the target workqueue
76af4d93 540 *
24acfb71 541 * This must be called either with wq->mutex held or RCU read locked.
794b18bc
TH
542 * If the pwq needs to be used beyond the locking in effect, the caller is
543 * responsible for guaranteeing that the pwq stays online.
76af4d93
TH
544 *
545 * The if/else clause exists only for the lockdep assertion and can be
546 * ignored.
49e3cf44
TH
547 */
548#define for_each_pwq(pwq, wq) \
49e9d1a9 549 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \
5a644662 550 lockdep_is_held(&(wq->mutex)))
f3421797 551
dc186ad7
TG
552#ifdef CONFIG_DEBUG_OBJECTS_WORK
553
f9e62f31 554static const struct debug_obj_descr work_debug_descr;
dc186ad7 555
99777288
SG
556static void *work_debug_hint(void *addr)
557{
558 return ((struct work_struct *) addr)->func;
559}
560
b9fdac7f
DC
561static bool work_is_static_object(void *addr)
562{
563 struct work_struct *work = addr;
564
565 return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
566}
567
dc186ad7
TG
568/*
569 * fixup_init is called when:
570 * - an active object is initialized
571 */
02a982a6 572static bool work_fixup_init(void *addr, enum debug_obj_state state)
dc186ad7
TG
573{
574 struct work_struct *work = addr;
575
576 switch (state) {
577 case ODEBUG_STATE_ACTIVE:
578 cancel_work_sync(work);
579 debug_object_init(work, &work_debug_descr);
02a982a6 580 return true;
dc186ad7 581 default:
02a982a6 582 return false;
dc186ad7
TG
583 }
584}
585
dc186ad7
TG
586/*
587 * fixup_free is called when:
588 * - an active object is freed
589 */
02a982a6 590static bool work_fixup_free(void *addr, enum debug_obj_state state)
dc186ad7
TG
591{
592 struct work_struct *work = addr;
593
594 switch (state) {
595 case ODEBUG_STATE_ACTIVE:
596 cancel_work_sync(work);
597 debug_object_free(work, &work_debug_descr);
02a982a6 598 return true;
dc186ad7 599 default:
02a982a6 600 return false;
dc186ad7
TG
601 }
602}
603
f9e62f31 604static const struct debug_obj_descr work_debug_descr = {
dc186ad7 605 .name = "work_struct",
99777288 606 .debug_hint = work_debug_hint,
b9fdac7f 607 .is_static_object = work_is_static_object,
dc186ad7 608 .fixup_init = work_fixup_init,
dc186ad7
TG
609 .fixup_free = work_fixup_free,
610};
611
612static inline void debug_work_activate(struct work_struct *work)
613{
614 debug_object_activate(work, &work_debug_descr);
615}
616
617static inline void debug_work_deactivate(struct work_struct *work)
618{
619 debug_object_deactivate(work, &work_debug_descr);
620}
621
622void __init_work(struct work_struct *work, int onstack)
623{
624 if (onstack)
625 debug_object_init_on_stack(work, &work_debug_descr);
626 else
627 debug_object_init(work, &work_debug_descr);
628}
629EXPORT_SYMBOL_GPL(__init_work);
630
631void destroy_work_on_stack(struct work_struct *work)
632{
633 debug_object_free(work, &work_debug_descr);
634}
635EXPORT_SYMBOL_GPL(destroy_work_on_stack);
636
ea2e64f2
TG
637void destroy_delayed_work_on_stack(struct delayed_work *work)
638{
639 destroy_timer_on_stack(&work->timer);
640 debug_object_free(&work->work, &work_debug_descr);
641}
642EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
643
dc186ad7
TG
644#else
645static inline void debug_work_activate(struct work_struct *work) { }
646static inline void debug_work_deactivate(struct work_struct *work) { }
647#endif
648
4e8b22bd 649/**
67dc8325 650 * worker_pool_assign_id - allocate ID and assign it to @pool
4e8b22bd
LB
651 * @pool: the pool pointer of interest
652 *
653 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
654 * successfully, -errno on failure.
655 */
9daf9e67
TH
656static int worker_pool_assign_id(struct worker_pool *pool)
657{
658 int ret;
659
68e13a67 660 lockdep_assert_held(&wq_pool_mutex);
5bcab335 661
4e8b22bd
LB
662 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
663 GFP_KERNEL);
229641a6 664 if (ret >= 0) {
e68035fb 665 pool->id = ret;
229641a6
TH
666 return 0;
667 }
fa1b54e6 668 return ret;
7c3eed5c
TH
669}
670
9f66cff2
TH
671static struct pool_workqueue __rcu **
672unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
673{
674 if (cpu >= 0)
675 return per_cpu_ptr(wq->cpu_pwq, cpu);
676 else
677 return &wq->dfl_pwq;
678}
679
680/* @cpu < 0 for dfl_pwq */
681static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
682{
683 return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
684 lockdep_is_held(&wq_pool_mutex) ||
685 lockdep_is_held(&wq->mutex));
686}
687
5797b1c1
TH
688/**
689 * unbound_effective_cpumask - effective cpumask of an unbound workqueue
690 * @wq: workqueue of interest
691 *
692 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
693 * is masked with wq_unbound_cpumask to determine the effective cpumask. The
694 * default pwq is always mapped to the pool with the current effective cpumask.
695 */
696static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
697{
698 return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
699}
700
73f53c4a
TH
701static unsigned int work_color_to_flags(int color)
702{
703 return color << WORK_STRUCT_COLOR_SHIFT;
704}
705
c4560c2c 706static int get_work_color(unsigned long work_data)
73f53c4a 707{
c4560c2c 708 return (work_data >> WORK_STRUCT_COLOR_SHIFT) &
73f53c4a
TH
709 ((1 << WORK_STRUCT_COLOR_BITS) - 1);
710}
711
712static int work_next_color(int color)
713{
714 return (color + 1) % WORK_NR_COLORS;
715}
1da177e4 716
14441960 717/*
112202d9
TH
718 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
719 * contain the pointer to the queued pwq. Once execution starts, the flag
7c3eed5c 720 * is cleared and the high bits contain OFFQ flags and pool ID.
7a22ad75 721 *
112202d9
TH
722 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
723 * and clear_work_data() can be used to set the pwq, pool or clear
bbb68dfa
TH
724 * work->data. These functions should only be called while the work is
725 * owned - ie. while the PENDING bit is set.
7a22ad75 726 *
112202d9 727 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
7c3eed5c 728 * corresponding to a work. Pool is available once the work has been
112202d9 729 * queued anywhere after initialization until it is sync canceled. pwq is
7c3eed5c 730 * available only while the work item is queued.
7a22ad75 731 *
bbb68dfa
TH
732 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
733 * canceled. While being canceled, a work item may have its PENDING set
734 * but stay off timer and worklist for arbitrarily long and nobody should
735 * try to steal the PENDING bit.
14441960 736 */
7a22ad75
TH
737static inline void set_work_data(struct work_struct *work, unsigned long data,
738 unsigned long flags)
365970a1 739{
6183c009 740 WARN_ON_ONCE(!work_pending(work));
7a22ad75
TH
741 atomic_long_set(&work->data, data | flags | work_static(work));
742}
365970a1 743
112202d9 744static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
7a22ad75
TH
745 unsigned long extra_flags)
746{
112202d9
TH
747 set_work_data(work, (unsigned long)pwq,
748 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
365970a1
DH
749}
750
4468a00f
LJ
751static void set_work_pool_and_keep_pending(struct work_struct *work,
752 int pool_id)
753{
754 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
755 WORK_STRUCT_PENDING);
756}
757
7c3eed5c
TH
758static void set_work_pool_and_clear_pending(struct work_struct *work,
759 int pool_id)
7a22ad75 760{
23657bb1
TH
761 /*
762 * The following wmb is paired with the implied mb in
763 * test_and_set_bit(PENDING) and ensures all updates to @work made
764 * here are visible to and precede any updates by the next PENDING
765 * owner.
766 */
767 smp_wmb();
7c3eed5c 768 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
346c09f8
RP
769 /*
770 * The following mb guarantees that previous clear of a PENDING bit
771 * will not be reordered with any speculative LOADS or STORES from
772 * work->current_func, which is executed afterwards. This possible
8bdc6201 773 * reordering can lead to a missed execution on attempt to queue
346c09f8
RP
774 * the same @work. E.g. consider this case:
775 *
776 * CPU#0 CPU#1
777 * ---------------------------- --------------------------------
778 *
779 * 1 STORE event_indicated
780 * 2 queue_work_on() {
781 * 3 test_and_set_bit(PENDING)
782 * 4 } set_..._and_clear_pending() {
783 * 5 set_work_data() # clear bit
784 * 6 smp_mb()
785 * 7 work->current_func() {
786 * 8 LOAD event_indicated
787 * }
788 *
789 * Without an explicit full barrier speculative LOAD on line 8 can
790 * be executed before CPU#0 does STORE on line 1. If that happens,
791 * CPU#0 observes the PENDING bit is still set and new execution of
792 * a @work is not queued in a hope, that CPU#1 will eventually
793 * finish the queued @work. Meanwhile CPU#1 does not see
794 * event_indicated is set, because speculative LOAD was executed
795 * before actual STORE.
796 */
797 smp_mb();
7a22ad75 798}
f756d5e2 799
7a22ad75 800static void clear_work_data(struct work_struct *work)
1da177e4 801{
7c3eed5c
TH
802 smp_wmb(); /* see set_work_pool_and_clear_pending() */
803 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
1da177e4
LT
804}
805
afa4bb77
LT
806static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
807{
808 return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);
809}
810
112202d9 811static struct pool_workqueue *get_work_pwq(struct work_struct *work)
b1f4ec17 812{
e120153d 813 unsigned long data = atomic_long_read(&work->data);
7a22ad75 814
112202d9 815 if (data & WORK_STRUCT_PWQ)
afa4bb77 816 return work_struct_pwq(data);
e120153d
TH
817 else
818 return NULL;
4d707b9f
ON
819}
820
7c3eed5c
TH
821/**
822 * get_work_pool - return the worker_pool a given work was associated with
823 * @work: the work item of interest
824 *
68e13a67 825 * Pools are created and destroyed under wq_pool_mutex, and allows read
24acfb71
TG
826 * access under RCU read lock. As such, this function should be
827 * called under wq_pool_mutex or inside of a rcu_read_lock() region.
fa1b54e6
TH
828 *
829 * All fields of the returned pool are accessible as long as the above
830 * mentioned locking is in effect. If the returned pool needs to be used
831 * beyond the critical section, the caller is responsible for ensuring the
832 * returned pool is and stays online.
d185af30
YB
833 *
834 * Return: The worker_pool @work was last associated with. %NULL if none.
7c3eed5c
TH
835 */
836static struct worker_pool *get_work_pool(struct work_struct *work)
365970a1 837{
e120153d 838 unsigned long data = atomic_long_read(&work->data);
7c3eed5c 839 int pool_id;
7a22ad75 840
68e13a67 841 assert_rcu_or_pool_mutex();
fa1b54e6 842
112202d9 843 if (data & WORK_STRUCT_PWQ)
afa4bb77 844 return work_struct_pwq(data)->pool;
7a22ad75 845
7c3eed5c
TH
846 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
847 if (pool_id == WORK_OFFQ_POOL_NONE)
7a22ad75
TH
848 return NULL;
849
fa1b54e6 850 return idr_find(&worker_pool_idr, pool_id);
7c3eed5c
TH
851}
852
853/**
854 * get_work_pool_id - return the worker pool ID a given work is associated with
855 * @work: the work item of interest
856 *
d185af30 857 * Return: The worker_pool ID @work was last associated with.
7c3eed5c
TH
858 * %WORK_OFFQ_POOL_NONE if none.
859 */
860static int get_work_pool_id(struct work_struct *work)
861{
54d5b7d0
LJ
862 unsigned long data = atomic_long_read(&work->data);
863
112202d9 864 if (data & WORK_STRUCT_PWQ)
afa4bb77 865 return work_struct_pwq(data)->pool->id;
7c3eed5c 866
54d5b7d0 867 return data >> WORK_OFFQ_POOL_SHIFT;
7c3eed5c
TH
868}
869
bbb68dfa
TH
870static void mark_work_canceling(struct work_struct *work)
871{
7c3eed5c 872 unsigned long pool_id = get_work_pool_id(work);
bbb68dfa 873
7c3eed5c
TH
874 pool_id <<= WORK_OFFQ_POOL_SHIFT;
875 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
bbb68dfa
TH
876}
877
878static bool work_is_canceling(struct work_struct *work)
879{
880 unsigned long data = atomic_long_read(&work->data);
881
112202d9 882 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
bbb68dfa
TH
883}
884
e22bee78 885/*
3270476a
TH
886 * Policy functions. These define the policies on how the global worker
887 * pools are managed. Unless noted otherwise, these functions assume that
d565ed63 888 * they're being called with pool->lock held.
e22bee78
TH
889 */
890
4594bf15 891/*
e22bee78
TH
892 * Need to wake up a worker? Called from anything but currently
893 * running workers.
974271c4
TH
894 *
895 * Note that, because unbound workers never contribute to nr_running, this
706026c2 896 * function will always return %true for unbound pools as long as the
974271c4 897 * worklist isn't empty.
4594bf15 898 */
63d95a91 899static bool need_more_worker(struct worker_pool *pool)
365970a1 900{
0219a352 901 return !list_empty(&pool->worklist) && !pool->nr_running;
e22bee78 902}
4594bf15 903
e22bee78 904/* Can I start working? Called from busy but !running workers. */
63d95a91 905static bool may_start_working(struct worker_pool *pool)
e22bee78 906{
63d95a91 907 return pool->nr_idle;
e22bee78
TH
908}
909
910/* Do I need to keep working? Called from currently running workers. */
63d95a91 911static bool keep_working(struct worker_pool *pool)
e22bee78 912{
bc35f7ef 913 return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
e22bee78
TH
914}
915
916/* Do we need a new worker? Called from manager. */
63d95a91 917static bool need_to_create_worker(struct worker_pool *pool)
e22bee78 918{
63d95a91 919 return need_more_worker(pool) && !may_start_working(pool);
e22bee78 920}
365970a1 921
e22bee78 922/* Do we have too many workers and should some go away? */
63d95a91 923static bool too_many_workers(struct worker_pool *pool)
e22bee78 924{
692b4825 925 bool managing = pool->flags & POOL_MANAGER_ACTIVE;
63d95a91
TH
926 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
927 int nr_busy = pool->nr_workers - nr_idle;
e22bee78
TH
928
929 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
365970a1
DH
930}
931
c54d5046
TH
932/**
933 * worker_set_flags - set worker flags and adjust nr_running accordingly
934 * @worker: self
935 * @flags: flags to set
936 *
937 * Set @flags in @worker->flags and adjust nr_running accordingly.
c54d5046
TH
938 */
939static inline void worker_set_flags(struct worker *worker, unsigned int flags)
940{
941 struct worker_pool *pool = worker->pool;
942
bc8b50c2 943 lockdep_assert_held(&pool->lock);
c54d5046
TH
944
945 /* If transitioning into NOT_RUNNING, adjust nr_running. */
946 if ((flags & WORKER_NOT_RUNNING) &&
947 !(worker->flags & WORKER_NOT_RUNNING)) {
948 pool->nr_running--;
949 }
950
951 worker->flags |= flags;
952}
953
954/**
955 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
956 * @worker: self
957 * @flags: flags to clear
958 *
959 * Clear @flags in @worker->flags and adjust nr_running accordingly.
c54d5046
TH
960 */
961static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
962{
963 struct worker_pool *pool = worker->pool;
964 unsigned int oflags = worker->flags;
965
bc8b50c2 966 lockdep_assert_held(&pool->lock);
c54d5046
TH
967
968 worker->flags &= ~flags;
969
970 /*
971 * If transitioning out of NOT_RUNNING, increment nr_running. Note
972 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
973 * of multiple flags, not a single flag.
974 */
975 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
976 if (!(worker->flags & WORKER_NOT_RUNNING))
977 pool->nr_running++;
978}
979
797e8345
TH
980/* Return the first idle worker. Called with pool->lock held. */
981static struct worker *first_idle_worker(struct worker_pool *pool)
982{
983 if (unlikely(list_empty(&pool->idle_list)))
984 return NULL;
985
986 return list_first_entry(&pool->idle_list, struct worker, entry);
987}
988
989/**
990 * worker_enter_idle - enter idle state
991 * @worker: worker which is entering idle state
992 *
993 * @worker is entering idle state. Update stats and idle timer if
994 * necessary.
995 *
996 * LOCKING:
997 * raw_spin_lock_irq(pool->lock).
998 */
999static void worker_enter_idle(struct worker *worker)
1000{
1001 struct worker_pool *pool = worker->pool;
1002
1003 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1004 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1005 (worker->hentry.next || worker->hentry.pprev)))
1006 return;
1007
1008 /* can't use worker_set_flags(), also called from create_worker() */
1009 worker->flags |= WORKER_IDLE;
1010 pool->nr_idle++;
1011 worker->last_active = jiffies;
1012
1013 /* idle_list is LIFO */
1014 list_add(&worker->entry, &pool->idle_list);
1015
1016 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1017 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1018
1019 /* Sanity check nr_running. */
1020 WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
1021}
1022
1023/**
1024 * worker_leave_idle - leave idle state
1025 * @worker: worker which is leaving idle state
1026 *
1027 * @worker is leaving idle state. Update stats.
1028 *
1029 * LOCKING:
1030 * raw_spin_lock_irq(pool->lock).
1031 */
1032static void worker_leave_idle(struct worker *worker)
1033{
1034 struct worker_pool *pool = worker->pool;
1035
1036 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1037 return;
1038 worker_clr_flags(worker, WORKER_IDLE);
1039 pool->nr_idle--;
1040 list_del_init(&worker->entry);
1041}
1042
1043/**
1044 * find_worker_executing_work - find worker which is executing a work
1045 * @pool: pool of interest
1046 * @work: work to find worker for
1047 *
1048 * Find a worker which is executing @work on @pool by searching
1049 * @pool->busy_hash which is keyed by the address of @work. For a worker
1050 * to match, its current execution should match the address of @work and
1051 * its work function. This is to avoid unwanted dependency between
1052 * unrelated work executions through a work item being recycled while still
1053 * being executed.
1054 *
1055 * This is a bit tricky. A work item may be freed once its execution
1056 * starts and nothing prevents the freed area from being recycled for
1057 * another work item. If the same work item address ends up being reused
1058 * before the original execution finishes, workqueue will identify the
1059 * recycled work item as currently executing and make it wait until the
1060 * current execution finishes, introducing an unwanted dependency.
1061 *
1062 * This function checks the work item address and work function to avoid
1063 * false positives. Note that this isn't complete as one may construct a
1064 * work function which can introduce dependency onto itself through a
1065 * recycled work item. Well, if somebody wants to shoot oneself in the
1066 * foot that badly, there's only so much we can do, and if such deadlock
1067 * actually occurs, it should be easy to locate the culprit work function.
1068 *
1069 * CONTEXT:
1070 * raw_spin_lock_irq(pool->lock).
1071 *
1072 * Return:
1073 * Pointer to worker which is executing @work if found, %NULL
1074 * otherwise.
1075 */
1076static struct worker *find_worker_executing_work(struct worker_pool *pool,
1077 struct work_struct *work)
1078{
1079 struct worker *worker;
1080
1081 hash_for_each_possible(pool->busy_hash, worker, hentry,
1082 (unsigned long)work)
1083 if (worker->current_work == work &&
1084 worker->current_func == work->func)
1085 return worker;
1086
1087 return NULL;
1088}
1089
1090/**
1091 * move_linked_works - move linked works to a list
1092 * @work: start of series of works to be scheduled
1093 * @head: target list to append @work to
1094 * @nextp: out parameter for nested worklist walking
1095 *
873eaca6
TH
1096 * Schedule linked works starting from @work to @head. Work series to be
1097 * scheduled starts at @work and includes any consecutive work with
1098 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
1099 * @nextp.
797e8345
TH
1100 *
1101 * CONTEXT:
1102 * raw_spin_lock_irq(pool->lock).
1103 */
1104static void move_linked_works(struct work_struct *work, struct list_head *head,
1105 struct work_struct **nextp)
1106{
1107 struct work_struct *n;
1108
1109 /*
1110 * Linked worklist will always end before the end of the list,
1111 * use NULL for list head.
1112 */
1113 list_for_each_entry_safe_from(work, n, NULL, entry) {
1114 list_move_tail(&work->entry, head);
1115 if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
1116 break;
1117 }
1118
1119 /*
1120 * If we're already inside safe list traversal and have moved
1121 * multiple works to the scheduled queue, the next position
1122 * needs to be updated.
1123 */
1124 if (nextp)
1125 *nextp = n;
1126}
1127
873eaca6
TH
1128/**
1129 * assign_work - assign a work item and its linked work items to a worker
1130 * @work: work to assign
1131 * @worker: worker to assign to
1132 * @nextp: out parameter for nested worklist walking
1133 *
1134 * Assign @work and its linked work items to @worker. If @work is already being
1135 * executed by another worker in the same pool, it'll be punted there.
1136 *
1137 * If @nextp is not NULL, it's updated to point to the next work of the last
1138 * scheduled work. This allows assign_work() to be nested inside
1139 * list_for_each_entry_safe().
1140 *
1141 * Returns %true if @work was successfully assigned to @worker. %false if @work
1142 * was punted to another worker already executing it.
1143 */
1144static bool assign_work(struct work_struct *work, struct worker *worker,
1145 struct work_struct **nextp)
1146{
1147 struct worker_pool *pool = worker->pool;
1148 struct worker *collision;
1149
1150 lockdep_assert_held(&pool->lock);
1151
1152 /*
1153 * A single work shouldn't be executed concurrently by multiple workers.
1154 * __queue_work() ensures that @work doesn't jump to a different pool
1155 * while still running in the previous pool. Here, we should ensure that
1156 * @work is not executed concurrently by multiple workers from the same
1157 * pool. Check whether anyone is already processing the work. If so,
1158 * defer the work to the currently executing one.
1159 */
1160 collision = find_worker_executing_work(pool, work);
1161 if (unlikely(collision)) {
1162 move_linked_works(work, &collision->scheduled, nextp);
1163 return false;
1164 }
1165
1166 move_linked_works(work, &worker->scheduled, nextp);
1167 return true;
1168}
1169
797e8345 1170/**
0219a352
TH
1171 * kick_pool - wake up an idle worker if necessary
1172 * @pool: pool to kick
797e8345 1173 *
0219a352
TH
1174 * @pool may have pending work items. Wake up worker if necessary. Returns
1175 * whether a worker was woken up.
797e8345 1176 */
0219a352 1177static bool kick_pool(struct worker_pool *pool)
797e8345
TH
1178{
1179 struct worker *worker = first_idle_worker(pool);
8639eceb 1180 struct task_struct *p;
797e8345 1181
0219a352
TH
1182 lockdep_assert_held(&pool->lock);
1183
1184 if (!need_more_worker(pool) || !worker)
1185 return false;
1186
8639eceb
TH
1187 p = worker->task;
1188
1189#ifdef CONFIG_SMP
1190 /*
1191 * Idle @worker is about to execute @work and waking up provides an
1192 * opportunity to migrate @worker at a lower cost by setting the task's
1193 * wake_cpu field. Let's see if we want to move @worker to improve
1194 * execution locality.
1195 *
1196 * We're waking the worker that went idle the latest and there's some
1197 * chance that @worker is marked idle but hasn't gone off CPU yet. If
1198 * so, setting the wake_cpu won't do anything. As this is a best-effort
1199 * optimization and the race window is narrow, let's leave as-is for
1200 * now. If this becomes pronounced, we can skip over workers which are
1201 * still on cpu when picking an idle worker.
1202 *
1203 * If @pool has non-strict affinity, @worker might have ended up outside
1204 * its affinity scope. Repatriate.
1205 */
1206 if (!pool->attrs->affn_strict &&
1207 !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
1208 struct work_struct *work = list_first_entry(&pool->worklist,
1209 struct work_struct, entry);
1210 p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
1211 get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
1212 }
1213#endif
1214 wake_up_process(p);
0219a352 1215 return true;
797e8345
TH
1216}
1217
63638450
TH
1218#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
1219
1220/*
1221 * Concurrency-managed per-cpu work items that hog CPU for longer than
1222 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
1223 * which prevents them from stalling other concurrency-managed work items. If a
1224 * work function keeps triggering this mechanism, it's likely that the work item
1225 * should be using an unbound workqueue instead.
1226 *
1227 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
1228 * and report them so that they can be examined and converted to use unbound
1229 * workqueues as appropriate. To avoid flooding the console, each violating work
1230 * function is tracked and reported with exponential backoff.
1231 */
1232#define WCI_MAX_ENTS 128
1233
1234struct wci_ent {
1235 work_func_t func;
1236 atomic64_t cnt;
1237 struct hlist_node hash_node;
1238};
1239
1240static struct wci_ent wci_ents[WCI_MAX_ENTS];
1241static int wci_nr_ents;
1242static DEFINE_RAW_SPINLOCK(wci_lock);
1243static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));
1244
1245static struct wci_ent *wci_find_ent(work_func_t func)
1246{
1247 struct wci_ent *ent;
1248
1249 hash_for_each_possible_rcu(wci_hash, ent, hash_node,
1250 (unsigned long)func) {
1251 if (ent->func == func)
1252 return ent;
1253 }
1254 return NULL;
1255}
1256
1257static void wq_cpu_intensive_report(work_func_t func)
1258{
1259 struct wci_ent *ent;
1260
1261restart:
1262 ent = wci_find_ent(func);
1263 if (ent) {
1264 u64 cnt;
1265
1266 /*
1267 * Start reporting from the fourth time and back off
1268 * exponentially.
1269 */
1270 cnt = atomic64_inc_return_relaxed(&ent->cnt);
1271 if (cnt >= 4 && is_power_of_2(cnt))
1272 printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
1273 ent->func, wq_cpu_intensive_thresh_us,
1274 atomic64_read(&ent->cnt));
1275 return;
1276 }
1277
1278 /*
1279 * @func is a new violation. Allocate a new entry for it. If wcn_ents[]
1280 * is exhausted, something went really wrong and we probably made enough
1281 * noise already.
1282 */
1283 if (wci_nr_ents >= WCI_MAX_ENTS)
1284 return;
1285
1286 raw_spin_lock(&wci_lock);
1287
1288 if (wci_nr_ents >= WCI_MAX_ENTS) {
1289 raw_spin_unlock(&wci_lock);
1290 return;
1291 }
1292
1293 if (wci_find_ent(func)) {
1294 raw_spin_unlock(&wci_lock);
1295 goto restart;
1296 }
1297
1298 ent = &wci_ents[wci_nr_ents++];
1299 ent->func = func;
1300 atomic64_set(&ent->cnt, 1);
1301 hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);
1302
1303 raw_spin_unlock(&wci_lock);
1304}
1305
1306#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1307static void wq_cpu_intensive_report(work_func_t func) {}
1308#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
1309
d302f017 1310/**
6d25be57 1311 * wq_worker_running - a worker is running again
e22bee78 1312 * @task: task waking up
e22bee78 1313 *
6d25be57 1314 * This function is called when a worker returns from schedule()
e22bee78 1315 */
6d25be57 1316void wq_worker_running(struct task_struct *task)
e22bee78
TH
1317{
1318 struct worker *worker = kthread_data(task);
1319
c8f6219b 1320 if (!READ_ONCE(worker->sleeping))
6d25be57 1321 return;
07edfece
FW
1322
1323 /*
1324 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
1325 * and the nr_running increment below, we may ruin the nr_running reset
1326 * and leave with an unexpected pool->nr_running == 1 on the newly unbound
1327 * pool. Protect against such race.
1328 */
1329 preempt_disable();
6d25be57 1330 if (!(worker->flags & WORKER_NOT_RUNNING))
bc35f7ef 1331 worker->pool->nr_running++;
07edfece 1332 preempt_enable();
616db877
TH
1333
1334 /*
1335 * CPU intensive auto-detection cares about how long a work item hogged
1336 * CPU without sleeping. Reset the starting timestamp on wakeup.
1337 */
1338 worker->current_at = worker->task->se.sum_exec_runtime;
1339
c8f6219b 1340 WRITE_ONCE(worker->sleeping, 0);
e22bee78
TH
1341}
1342
1343/**
1344 * wq_worker_sleeping - a worker is going to sleep
1345 * @task: task going to sleep
e22bee78 1346 *
6d25be57 1347 * This function is called from schedule() when a busy worker is
ccf45156 1348 * going to sleep.
e22bee78 1349 */
6d25be57 1350void wq_worker_sleeping(struct task_struct *task)
e22bee78 1351{
cc5bff38 1352 struct worker *worker = kthread_data(task);
111c225a 1353 struct worker_pool *pool;
e22bee78 1354
111c225a
TH
1355 /*
1356 * Rescuers, which may not have all the fields set up like normal
1357 * workers, also reach here, let's not access anything before
1358 * checking NOT_RUNNING.
1359 */
2d64672e 1360 if (worker->flags & WORKER_NOT_RUNNING)
6d25be57 1361 return;
e22bee78 1362
111c225a 1363 pool = worker->pool;
111c225a 1364
62849a96 1365 /* Return if preempted before wq_worker_running() was reached */
c8f6219b 1366 if (READ_ONCE(worker->sleeping))
6d25be57
TG
1367 return;
1368
c8f6219b 1369 WRITE_ONCE(worker->sleeping, 1);
a9b8a985 1370 raw_spin_lock_irq(&pool->lock);
e22bee78 1371
45c753f5
FW
1372 /*
1373 * Recheck in case unbind_workers() preempted us. We don't
1374 * want to decrement nr_running after the worker is unbound
1375 * and nr_running has been reset.
1376 */
1377 if (worker->flags & WORKER_NOT_RUNNING) {
1378 raw_spin_unlock_irq(&pool->lock);
1379 return;
1380 }
1381
bc35f7ef 1382 pool->nr_running--;
0219a352 1383 if (kick_pool(pool))
725e8ec5 1384 worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
0219a352 1385
a9b8a985 1386 raw_spin_unlock_irq(&pool->lock);
e22bee78
TH
1387}
1388
616db877
TH
1389/**
1390 * wq_worker_tick - a scheduler tick occurred while a kworker is running
1391 * @task: task currently running
1392 *
1393 * Called from scheduler_tick(). We're in the IRQ context and the current
1394 * worker's fields which follow the 'K' locking rule can be accessed safely.
1395 */
1396void wq_worker_tick(struct task_struct *task)
1397{
1398 struct worker *worker = kthread_data(task);
1399 struct pool_workqueue *pwq = worker->current_pwq;
1400 struct worker_pool *pool = worker->pool;
1401
1402 if (!pwq)
1403 return;
1404
8a1dd1e5
TH
1405 pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;
1406
18c8ae81
Z
1407 if (!wq_cpu_intensive_thresh_us)
1408 return;
1409
616db877
TH
1410 /*
1411 * If the current worker is concurrency managed and hogged the CPU for
1412 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
1413 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
c8f6219b
Z
1414 *
1415 * Set @worker->sleeping means that @worker is in the process of
1416 * switching out voluntarily and won't be contributing to
1417 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
1418 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
1419 * double decrements. The task is releasing the CPU anyway. Let's skip.
1420 * We probably want to make this prettier in the future.
616db877 1421 */
c8f6219b 1422 if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
616db877
TH
1423 worker->task->se.sum_exec_runtime - worker->current_at <
1424 wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
1425 return;
1426
1427 raw_spin_lock(&pool->lock);
1428
1429 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
63638450 1430 wq_cpu_intensive_report(worker->current_func);
616db877
TH
1431 pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
1432
0219a352 1433 if (kick_pool(pool))
616db877 1434 pwq->stats[PWQ_STAT_CM_WAKEUP]++;
616db877
TH
1435
1436 raw_spin_unlock(&pool->lock);
1437}
1438
1b69ac6b
JW
1439/**
1440 * wq_worker_last_func - retrieve worker's last work function
8194fe94 1441 * @task: Task to retrieve last work function of.
1b69ac6b
JW
1442 *
1443 * Determine the last function a worker executed. This is called from
1444 * the scheduler to get a worker's last known identity.
1445 *
1446 * CONTEXT:
a9b8a985 1447 * raw_spin_lock_irq(rq->lock)
1b69ac6b 1448 *
4b047002
JW
1449 * This function is called during schedule() when a kworker is going
1450 * to sleep. It's used by psi to identify aggregation workers during
1451 * dequeuing, to allow periodic aggregation to shut-off when that
1452 * worker is the last task in the system or cgroup to go to sleep.
1453 *
1454 * As this function doesn't involve any workqueue-related locking, it
1455 * only returns stable values when called from inside the scheduler's
1456 * queuing and dequeuing paths, when @task, which must be a kworker,
1457 * is guaranteed to not be processing any works.
1458 *
1b69ac6b
JW
1459 * Return:
1460 * The last work function %current executed as a worker, NULL if it
1461 * hasn't executed any work yet.
1462 */
1463work_func_t wq_worker_last_func(struct task_struct *task)
1464{
1465 struct worker *worker = kthread_data(task);
1466
1467 return worker->last_func;
1468}
1469
91ccc6e7
TH
1470/**
1471 * wq_node_nr_active - Determine wq_node_nr_active to use
1472 * @wq: workqueue of interest
1473 * @node: NUMA node, can be %NUMA_NO_NODE
1474 *
1475 * Determine wq_node_nr_active to use for @wq on @node. Returns:
1476 *
1477 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
1478 *
1479 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
1480 *
1481 * - Otherwise, node_nr_active[@node].
1482 */
1483static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
1484 int node)
1485{
1486 if (!(wq->flags & WQ_UNBOUND))
1487 return NULL;
1488
1489 if (node == NUMA_NO_NODE)
1490 node = nr_node_ids;
1491
1492 return wq->node_nr_active[node];
1493}
1494
5797b1c1
TH
1495/**
1496 * wq_update_node_max_active - Update per-node max_actives to use
1497 * @wq: workqueue to update
1498 * @off_cpu: CPU that's going down, -1 if a CPU is not going down
1499 *
1500 * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is
1501 * distributed among nodes according to the proportions of numbers of online
1502 * cpus. The result is always between @wq->min_active and max_active.
1503 */
1504static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
1505{
1506 struct cpumask *effective = unbound_effective_cpumask(wq);
1507 int min_active = READ_ONCE(wq->min_active);
1508 int max_active = READ_ONCE(wq->max_active);
1509 int total_cpus, node;
1510
1511 lockdep_assert_held(&wq->mutex);
1512
15930da4 1513 if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
5797b1c1
TH
1514 off_cpu = -1;
1515
1516 total_cpus = cpumask_weight_and(effective, cpu_online_mask);
1517 if (off_cpu >= 0)
1518 total_cpus--;
1519
1520 for_each_node(node) {
1521 int node_cpus;
1522
1523 node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));
1524 if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
1525 node_cpus--;
1526
1527 wq_node_nr_active(wq, node)->max =
1528 clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),
1529 min_active, max_active);
1530 }
1531
1532 wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;
1533}
1534
8864b4e5
TH
1535/**
1536 * get_pwq - get an extra reference on the specified pool_workqueue
1537 * @pwq: pool_workqueue to get
1538 *
1539 * Obtain an extra reference on @pwq. The caller should guarantee that
1540 * @pwq has positive refcnt and be holding the matching pool->lock.
1541 */
1542static void get_pwq(struct pool_workqueue *pwq)
1543{
1544 lockdep_assert_held(&pwq->pool->lock);
1545 WARN_ON_ONCE(pwq->refcnt <= 0);
1546 pwq->refcnt++;
1547}
1548
1549/**
1550 * put_pwq - put a pool_workqueue reference
1551 * @pwq: pool_workqueue to put
1552 *
1553 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1554 * destruction. The caller should be holding the matching pool->lock.
1555 */
1556static void put_pwq(struct pool_workqueue *pwq)
1557{
1558 lockdep_assert_held(&pwq->pool->lock);
1559 if (likely(--pwq->refcnt))
1560 return;
8864b4e5 1561 /*
967b494e
TH
1562 * @pwq can't be released under pool->lock, bounce to a dedicated
1563 * kthread_worker to avoid A-A deadlocks.
8864b4e5 1564 */
687a9aa5 1565 kthread_queue_work(pwq_release_worker, &pwq->release_work);
8864b4e5
TH
1566}
1567
dce90d47
TH
1568/**
1569 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1570 * @pwq: pool_workqueue to put (can be %NULL)
1571 *
1572 * put_pwq() with locking. This function also allows %NULL @pwq.
1573 */
1574static void put_pwq_unlocked(struct pool_workqueue *pwq)
1575{
1576 if (pwq) {
1577 /*
24acfb71 1578 * As both pwqs and pools are RCU protected, the
dce90d47
TH
1579 * following lock operations are safe.
1580 */
a9b8a985 1581 raw_spin_lock_irq(&pwq->pool->lock);
dce90d47 1582 put_pwq(pwq);
a9b8a985 1583 raw_spin_unlock_irq(&pwq->pool->lock);
dce90d47
TH
1584 }
1585}
1586
afa87ce8
TH
1587static bool pwq_is_empty(struct pool_workqueue *pwq)
1588{
1589 return !pwq->nr_active && list_empty(&pwq->inactive_works);
1590}
1591
4c638030
TH
1592static void __pwq_activate_work(struct pool_workqueue *pwq,
1593 struct work_struct *work)
bf4ede01 1594{
1c270b79
TH
1595 unsigned long *wdb = work_data_bits(work);
1596
1597 WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
bf4ede01 1598 trace_workqueue_activate_work(work);
82607adc
TH
1599 if (list_empty(&pwq->pool->worklist))
1600 pwq->pool->watchdog_ts = jiffies;
112202d9 1601 move_linked_works(work, &pwq->pool->worklist, NULL);
1c270b79 1602 __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
4c638030
TH
1603}
1604
1605/**
1606 * pwq_activate_work - Activate a work item if inactive
1607 * @pwq: pool_workqueue @work belongs to
1608 * @work: work item to activate
1609 *
1610 * Returns %true if activated. %false if already active.
1611 */
1612static bool pwq_activate_work(struct pool_workqueue *pwq,
1613 struct work_struct *work)
1614{
1615 struct worker_pool *pool = pwq->pool;
91ccc6e7 1616 struct wq_node_nr_active *nna;
4c638030
TH
1617
1618 lockdep_assert_held(&pool->lock);
1619
1620 if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
1621 return false;
1622
91ccc6e7
TH
1623 nna = wq_node_nr_active(pwq->wq, pool->node);
1624 if (nna)
1625 atomic_inc(&nna->nr);
1626
112202d9 1627 pwq->nr_active++;
4c638030
TH
1628 __pwq_activate_work(pwq, work);
1629 return true;
bf4ede01
TH
1630}
1631
5797b1c1
TH
1632static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
1633{
1634 int max = READ_ONCE(nna->max);
1635
1636 while (true) {
1637 int old, tmp;
1638
1639 old = atomic_read(&nna->nr);
1640 if (old >= max)
1641 return false;
1642 tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
1643 if (tmp == old)
1644 return true;
1645 }
1646}
1647
1c270b79
TH
1648/**
1649 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
1650 * @pwq: pool_workqueue of interest
5797b1c1 1651 * @fill: max_active may have increased, try to increase concurrency level
1c270b79
TH
1652 *
1653 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
1654 * successfully obtained. %false otherwise.
1655 */
5797b1c1 1656static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
1c270b79
TH
1657{
1658 struct workqueue_struct *wq = pwq->wq;
1659 struct worker_pool *pool = pwq->pool;
91ccc6e7 1660 struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
5797b1c1 1661 bool obtained = false;
1c270b79
TH
1662
1663 lockdep_assert_held(&pool->lock);
1664
5797b1c1
TH
1665 if (!nna) {
1666 /* per-cpu workqueue, pwq->nr_active is sufficient */
1667 obtained = pwq->nr_active < READ_ONCE(wq->max_active);
1668 goto out;
1669 }
1670
1671 /*
1672 * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
1673 * already waiting on $nna, pwq_dec_nr_active() will maintain the
1674 * concurrency level. Don't jump the line.
1675 *
1676 * We need to ignore the pending test after max_active has increased as
1677 * pwq_dec_nr_active() can only maintain the concurrency level but not
1678 * increase it. This is indicated by @fill.
1679 */
1680 if (!list_empty(&pwq->pending_node) && likely(!fill))
1681 goto out;
1682
1683 obtained = tryinc_node_nr_active(nna);
1684 if (obtained)
1685 goto out;
1686
1687 /*
1688 * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
1689 * and try again. The smp_mb() is paired with the implied memory barrier
1690 * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
1691 * we see the decremented $nna->nr or they see non-empty
1692 * $nna->pending_pwqs.
1693 */
1694 raw_spin_lock(&nna->lock);
1695
1696 if (list_empty(&pwq->pending_node))
1697 list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
1698 else if (likely(!fill))
1699 goto out_unlock;
1700
1701 smp_mb();
1702
1703 obtained = tryinc_node_nr_active(nna);
1c270b79 1704
5797b1c1
TH
1705 /*
1706 * If @fill, @pwq might have already been pending. Being spuriously
1707 * pending in cold paths doesn't affect anything. Let's leave it be.
1708 */
1709 if (obtained && likely(!fill))
1710 list_del_init(&pwq->pending_node);
1711
1712out_unlock:
1713 raw_spin_unlock(&nna->lock);
1714out:
1715 if (obtained)
1c270b79
TH
1716 pwq->nr_active++;
1717 return obtained;
1718}
1719
1720/**
1721 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
1722 * @pwq: pool_workqueue of interest
5797b1c1 1723 * @fill: max_active may have increased, try to increase concurrency level
1c270b79
TH
1724 *
1725 * Activate the first inactive work item of @pwq if available and allowed by
1726 * max_active limit.
1727 *
1728 * Returns %true if an inactive work item has been activated. %false if no
1729 * inactive work item is found or max_active limit is reached.
1730 */
5797b1c1 1731static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
1c270b79
TH
1732{
1733 struct work_struct *work =
1734 list_first_entry_or_null(&pwq->inactive_works,
1735 struct work_struct, entry);
1736
5797b1c1 1737 if (work && pwq_tryinc_nr_active(pwq, fill)) {
1c270b79
TH
1738 __pwq_activate_work(pwq, work);
1739 return true;
1740 } else {
1741 return false;
1742 }
1743}
1744
5797b1c1
TH
1745/**
1746 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
1747 * @nna: wq_node_nr_active to activate a pending pwq for
1748 * @caller_pool: worker_pool the caller is locking
1749 *
1750 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
1751 * @caller_pool may be unlocked and relocked to lock other worker_pools.
1752 */
1753static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
1754 struct worker_pool *caller_pool)
1755{
1756 struct worker_pool *locked_pool = caller_pool;
1757 struct pool_workqueue *pwq;
1758 struct work_struct *work;
1759
1760 lockdep_assert_held(&caller_pool->lock);
1761
1762 raw_spin_lock(&nna->lock);
1763retry:
1764 pwq = list_first_entry_or_null(&nna->pending_pwqs,
1765 struct pool_workqueue, pending_node);
1766 if (!pwq)
1767 goto out_unlock;
1768
1769 /*
1770 * If @pwq is for a different pool than @locked_pool, we need to lock
1771 * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
1772 * / lock dance. For that, we also need to release @nna->lock as it's
1773 * nested inside pool locks.
1774 */
1775 if (pwq->pool != locked_pool) {
1776 raw_spin_unlock(&locked_pool->lock);
1777 locked_pool = pwq->pool;
1778 if (!raw_spin_trylock(&locked_pool->lock)) {
1779 raw_spin_unlock(&nna->lock);
1780 raw_spin_lock(&locked_pool->lock);
1781 raw_spin_lock(&nna->lock);
1782 goto retry;
1783 }
1784 }
1785
1786 /*
1787 * $pwq may not have any inactive work items due to e.g. cancellations.
1788 * Drop it from pending_pwqs and see if there's another one.
1789 */
1790 work = list_first_entry_or_null(&pwq->inactive_works,
1791 struct work_struct, entry);
1792 if (!work) {
1793 list_del_init(&pwq->pending_node);
1794 goto retry;
1795 }
1796
1797 /*
1798 * Acquire an nr_active count and activate the inactive work item. If
1799 * $pwq still has inactive work items, rotate it to the end of the
1800 * pending_pwqs so that we round-robin through them. This means that
1801 * inactive work items are not activated in queueing order which is fine
1802 * given that there has never been any ordering across different pwqs.
1803 */
1804 if (likely(tryinc_node_nr_active(nna))) {
1805 pwq->nr_active++;
1806 __pwq_activate_work(pwq, work);
1807
1808 if (list_empty(&pwq->inactive_works))
1809 list_del_init(&pwq->pending_node);
1810 else
1811 list_move_tail(&pwq->pending_node, &nna->pending_pwqs);
1812
1813 /* if activating a foreign pool, make sure it's running */
1814 if (pwq->pool != caller_pool)
1815 kick_pool(pwq->pool);
1816 }
1817
1818out_unlock:
1819 raw_spin_unlock(&nna->lock);
1820 if (locked_pool != caller_pool) {
1821 raw_spin_unlock(&locked_pool->lock);
1822 raw_spin_lock(&caller_pool->lock);
1823 }
1824}
1825
1c270b79
TH
1826/**
1827 * pwq_dec_nr_active - Retire an active count
1828 * @pwq: pool_workqueue of interest
1829 *
1830 * Decrement @pwq's nr_active and try to activate the first inactive work item.
5797b1c1 1831 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
1c270b79
TH
1832 */
1833static void pwq_dec_nr_active(struct pool_workqueue *pwq)
3aa62497 1834{
1c270b79 1835 struct worker_pool *pool = pwq->pool;
91ccc6e7 1836 struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
3aa62497 1837
1c270b79
TH
1838 lockdep_assert_held(&pool->lock);
1839
91ccc6e7
TH
1840 /*
1841 * @pwq->nr_active should be decremented for both percpu and unbound
1842 * workqueues.
1843 */
1c270b79 1844 pwq->nr_active--;
91ccc6e7
TH
1845
1846 /*
1847 * For a percpu workqueue, it's simple. Just need to kick the first
1848 * inactive work item on @pwq itself.
1849 */
1850 if (!nna) {
5797b1c1 1851 pwq_activate_first_inactive(pwq, false);
91ccc6e7
TH
1852 return;
1853 }
1854
5797b1c1
TH
1855 /*
1856 * If @pwq is for an unbound workqueue, it's more complicated because
1857 * multiple pwqs and pools may be sharing the nr_active count. When a
1858 * pwq needs to wait for an nr_active count, it puts itself on
1859 * $nna->pending_pwqs. The following atomic_dec_return()'s implied
1860 * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
1861 * guarantee that either we see non-empty pending_pwqs or they see
1862 * decremented $nna->nr.
1863 *
1864 * $nna->max may change as CPUs come online/offline and @pwq->wq's
1865 * max_active gets updated. However, it is guaranteed to be equal to or
1866 * larger than @pwq->wq->min_active which is above zero unless freezing.
1867 * This maintains the forward progress guarantee.
1868 */
1869 if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))
1870 return;
1871
1872 if (!list_empty(&nna->pending_pwqs))
1873 node_activate_pending_pwq(nna, pool);
3aa62497
LJ
1874}
1875
bf4ede01 1876/**
112202d9
TH
1877 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1878 * @pwq: pwq of interest
c4560c2c 1879 * @work_data: work_data of work which left the queue
bf4ede01
TH
1880 *
1881 * A work either has completed or is removed from pending queue,
112202d9 1882 * decrement nr_in_flight of its pwq and handle workqueue flushing.
bf4ede01 1883 *
dd6c3c54
TH
1884 * NOTE:
1885 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
1886 * and thus should be called after all other state updates for the in-flight
1887 * work item is complete.
1888 *
bf4ede01 1889 * CONTEXT:
a9b8a985 1890 * raw_spin_lock_irq(pool->lock).
bf4ede01 1891 */
c4560c2c 1892static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
bf4ede01 1893{
c4560c2c
LJ
1894 int color = get_work_color(work_data);
1895
1c270b79
TH
1896 if (!(work_data & WORK_STRUCT_INACTIVE))
1897 pwq_dec_nr_active(pwq);
018f3a13 1898
112202d9 1899 pwq->nr_in_flight[color]--;
bf4ede01 1900
bf4ede01 1901 /* is flush in progress and are we at the flushing tip? */
112202d9 1902 if (likely(pwq->flush_color != color))
8864b4e5 1903 goto out_put;
bf4ede01
TH
1904
1905 /* are there still in-flight works? */
112202d9 1906 if (pwq->nr_in_flight[color])
8864b4e5 1907 goto out_put;
bf4ede01 1908
112202d9
TH
1909 /* this pwq is done, clear flush_color */
1910 pwq->flush_color = -1;
bf4ede01
TH
1911
1912 /*
112202d9 1913 * If this was the last pwq, wake up the first flusher. It
bf4ede01
TH
1914 * will handle the rest.
1915 */
112202d9
TH
1916 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1917 complete(&pwq->wq->first_flusher->done);
8864b4e5
TH
1918out_put:
1919 put_pwq(pwq);
bf4ede01
TH
1920}
1921
36e227d2 1922/**
bbb68dfa 1923 * try_to_grab_pending - steal work item from worklist and disable irq
36e227d2
TH
1924 * @work: work item to steal
1925 * @is_dwork: @work is a delayed_work
bbb68dfa 1926 * @flags: place to store irq state
36e227d2
TH
1927 *
1928 * Try to grab PENDING bit of @work. This function can handle @work in any
d185af30 1929 * stable state - idle, on timer or on worklist.
36e227d2 1930 *
d185af30 1931 * Return:
3eb6b31b
MCC
1932 *
1933 * ======== ================================================================
36e227d2
TH
1934 * 1 if @work was pending and we successfully stole PENDING
1935 * 0 if @work was idle and we claimed PENDING
1936 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
bbb68dfa
TH
1937 * -ENOENT if someone else is canceling @work, this state may persist
1938 * for arbitrarily long
3eb6b31b 1939 * ======== ================================================================
36e227d2 1940 *
d185af30 1941 * Note:
bbb68dfa 1942 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
e0aecdd8
TH
1943 * interrupted while holding PENDING and @work off queue, irq must be
1944 * disabled on entry. This, combined with delayed_work->timer being
1945 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
bbb68dfa
TH
1946 *
1947 * On successful return, >= 0, irq is disabled and the caller is
1948 * responsible for releasing it using local_irq_restore(*@flags).
1949 *
e0aecdd8 1950 * This function is safe to call from any context including IRQ handler.
bf4ede01 1951 */
bbb68dfa
TH
1952static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1953 unsigned long *flags)
bf4ede01 1954{
d565ed63 1955 struct worker_pool *pool;
112202d9 1956 struct pool_workqueue *pwq;
bf4ede01 1957
bbb68dfa
TH
1958 local_irq_save(*flags);
1959
36e227d2
TH
1960 /* try to steal the timer if it exists */
1961 if (is_dwork) {
1962 struct delayed_work *dwork = to_delayed_work(work);
1963
e0aecdd8
TH
1964 /*
1965 * dwork->timer is irqsafe. If del_timer() fails, it's
1966 * guaranteed that the timer is not queued anywhere and not
1967 * running on the local CPU.
1968 */
36e227d2
TH
1969 if (likely(del_timer(&dwork->timer)))
1970 return 1;
1971 }
1972
1973 /* try to claim PENDING the normal way */
bf4ede01
TH
1974 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
1975 return 0;
1976
24acfb71 1977 rcu_read_lock();
bf4ede01
TH
1978 /*
1979 * The queueing is in progress, or it is already queued. Try to
1980 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1981 */
d565ed63
TH
1982 pool = get_work_pool(work);
1983 if (!pool)
bbb68dfa 1984 goto fail;
bf4ede01 1985
a9b8a985 1986 raw_spin_lock(&pool->lock);
0b3dae68 1987 /*
112202d9
TH
1988 * work->data is guaranteed to point to pwq only while the work
1989 * item is queued on pwq->wq, and both updating work->data to point
1990 * to pwq on queueing and to pool on dequeueing are done under
1991 * pwq->pool->lock. This in turn guarantees that, if work->data
1992 * points to pwq which is associated with a locked pool, the work
0b3dae68
LJ
1993 * item is currently queued on that pool.
1994 */
112202d9
TH
1995 pwq = get_work_pwq(work);
1996 if (pwq && pwq->pool == pool) {
16062836
TH
1997 debug_work_deactivate(work);
1998
1999 /*
018f3a13
LJ
2000 * A cancelable inactive work item must be in the
2001 * pwq->inactive_works since a queued barrier can't be
2002 * canceled (see the comments in insert_wq_barrier()).
2003 *
f97a4a1a 2004 * An inactive work item cannot be grabbed directly because
d812796e 2005 * it might have linked barrier work items which, if left
f97a4a1a 2006 * on the inactive_works list, will confuse pwq->nr_active
16062836
TH
2007 * management later on and cause stall. Make sure the work
2008 * item is activated before grabbing.
2009 */
4c638030 2010 pwq_activate_work(pwq, work);
16062836
TH
2011
2012 list_del_init(&work->entry);
16062836 2013
112202d9 2014 /* work->data points to pwq iff queued, point to pool */
16062836
TH
2015 set_work_pool_and_keep_pending(work, pool->id);
2016
dd6c3c54
TH
2017 /* must be the last step, see the function comment */
2018 pwq_dec_nr_in_flight(pwq, *work_data_bits(work));
2019
a9b8a985 2020 raw_spin_unlock(&pool->lock);
24acfb71 2021 rcu_read_unlock();
16062836 2022 return 1;
bf4ede01 2023 }
a9b8a985 2024 raw_spin_unlock(&pool->lock);
bbb68dfa 2025fail:
24acfb71 2026 rcu_read_unlock();
bbb68dfa
TH
2027 local_irq_restore(*flags);
2028 if (work_is_canceling(work))
2029 return -ENOENT;
2030 cpu_relax();
36e227d2 2031 return -EAGAIN;
bf4ede01
TH
2032}
2033
4690c4ab 2034/**
706026c2 2035 * insert_work - insert a work into a pool
112202d9 2036 * @pwq: pwq @work belongs to
4690c4ab
TH
2037 * @work: work to insert
2038 * @head: insertion point
2039 * @extra_flags: extra WORK_STRUCT_* flags to set
2040 *
112202d9 2041 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
706026c2 2042 * work_struct flags.
4690c4ab
TH
2043 *
2044 * CONTEXT:
a9b8a985 2045 * raw_spin_lock_irq(pool->lock).
4690c4ab 2046 */
112202d9
TH
2047static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
2048 struct list_head *head, unsigned int extra_flags)
b89deed3 2049{
fe089f87 2050 debug_work_activate(work);
e22bee78 2051
e89a85d6 2052 /* record the work call stack in order to print it in KASAN reports */
f70da745 2053 kasan_record_aux_stack_noalloc(work);
e89a85d6 2054
4690c4ab 2055 /* we own @work, set data and link */
112202d9 2056 set_work_pwq(work, pwq, extra_flags);
1a4d9b0a 2057 list_add_tail(&work->entry, head);
8864b4e5 2058 get_pwq(pwq);
b89deed3
ON
2059}
2060
c8efcc25
TH
2061/*
2062 * Test whether @work is being queued from another work executing on the
8d03ecfe 2063 * same workqueue.
c8efcc25
TH
2064 */
2065static bool is_chained_work(struct workqueue_struct *wq)
2066{
8d03ecfe
TH
2067 struct worker *worker;
2068
2069 worker = current_wq_worker();
2070 /*
bf393fd4 2071 * Return %true iff I'm a worker executing a work item on @wq. If
8d03ecfe
TH
2072 * I'm @worker, it's safe to dereference it without locking.
2073 */
112202d9 2074 return worker && worker->current_pwq->wq == wq;
c8efcc25
TH
2075}
2076
ef557180
MG
2077/*
2078 * When queueing an unbound work item to a wq, prefer local CPU if allowed
2079 * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
2080 * avoid perturbing sensitive tasks.
2081 */
2082static int wq_select_unbound_cpu(int cpu)
2083{
2084 int new_cpu;
2085
f303fccb
TH
2086 if (likely(!wq_debug_force_rr_cpu)) {
2087 if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
2088 return cpu;
a8ec5880
AF
2089 } else {
2090 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
f303fccb
TH
2091 }
2092
ef557180
MG
2093 new_cpu = __this_cpu_read(wq_rr_cpu_last);
2094 new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
2095 if (unlikely(new_cpu >= nr_cpu_ids)) {
2096 new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
2097 if (unlikely(new_cpu >= nr_cpu_ids))
2098 return cpu;
2099 }
2100 __this_cpu_write(wq_rr_cpu_last, new_cpu);
2101
2102 return new_cpu;
2103}
2104
d84ff051 2105static void __queue_work(int cpu, struct workqueue_struct *wq,
1da177e4
LT
2106 struct work_struct *work)
2107{
112202d9 2108 struct pool_workqueue *pwq;
fe089f87 2109 struct worker_pool *last_pool, *pool;
8a2e8e5d 2110 unsigned int work_flags;
b75cac93 2111 unsigned int req_cpu = cpu;
8930caba
TH
2112
2113 /*
2114 * While a work item is PENDING && off queue, a task trying to
2115 * steal the PENDING will busy-loop waiting for it to either get
2116 * queued or lose PENDING. Grabbing PENDING and queueing should
2117 * happen with IRQ disabled.
2118 */
8e8eb730 2119 lockdep_assert_irqs_disabled();
1da177e4 2120
1e19ffc6 2121
33e3f0a3
RC
2122 /*
2123 * For a draining wq, only works from the same workqueue are
2124 * allowed. The __WQ_DESTROYING helps to spot the issue that
2125 * queues a new work item to a wq after destroy_workqueue(wq).
2126 */
2127 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
2128 WARN_ON_ONCE(!is_chained_work(wq))))
e41e704b 2129 return;
24acfb71 2130 rcu_read_lock();
9e8cd2f5 2131retry:
c9178087 2132 /* pwq which will be used unless @work is executing elsewhere */
636b927e
TH
2133 if (req_cpu == WORK_CPU_UNBOUND) {
2134 if (wq->flags & WQ_UNBOUND)
aa202f1f 2135 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
636b927e 2136 else
aa202f1f 2137 cpu = raw_smp_processor_id();
aa202f1f 2138 }
dbf2576e 2139
636b927e 2140 pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
fe089f87
TH
2141 pool = pwq->pool;
2142
c9178087
TH
2143 /*
2144 * If @work was previously on a different pool, it might still be
2145 * running there, in which case the work needs to be queued on that
2146 * pool to guarantee non-reentrancy.
2147 */
2148 last_pool = get_work_pool(work);
fe089f87 2149 if (last_pool && last_pool != pool) {
c9178087 2150 struct worker *worker;
18aa9eff 2151
a9b8a985 2152 raw_spin_lock(&last_pool->lock);
18aa9eff 2153
c9178087 2154 worker = find_worker_executing_work(last_pool, work);
18aa9eff 2155
c9178087
TH
2156 if (worker && worker->current_pwq->wq == wq) {
2157 pwq = worker->current_pwq;
fe089f87
TH
2158 pool = pwq->pool;
2159 WARN_ON_ONCE(pool != last_pool);
8930caba 2160 } else {
c9178087 2161 /* meh... not running there, queue here */
a9b8a985 2162 raw_spin_unlock(&last_pool->lock);
fe089f87 2163 raw_spin_lock(&pool->lock);
8930caba 2164 }
f3421797 2165 } else {
fe089f87 2166 raw_spin_lock(&pool->lock);
502ca9d8
TH
2167 }
2168
9e8cd2f5 2169 /*
636b927e
TH
2170 * pwq is determined and locked. For unbound pools, we could have raced
2171 * with pwq release and it could already be dead. If its refcnt is zero,
2172 * repeat pwq selection. Note that unbound pwqs never die without
2173 * another pwq replacing it in cpu_pwq or while work items are executing
2174 * on it, so the retrying is guaranteed to make forward-progress.
9e8cd2f5
TH
2175 */
2176 if (unlikely(!pwq->refcnt)) {
2177 if (wq->flags & WQ_UNBOUND) {
fe089f87 2178 raw_spin_unlock(&pool->lock);
9e8cd2f5
TH
2179 cpu_relax();
2180 goto retry;
2181 }
2182 /* oops */
2183 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
2184 wq->name, cpu);
2185 }
2186
112202d9
TH
2187 /* pwq determined, queue */
2188 trace_workqueue_queue_work(req_cpu, pwq, work);
502ca9d8 2189
24acfb71
TG
2190 if (WARN_ON(!list_empty(&work->entry)))
2191 goto out;
1e19ffc6 2192
112202d9
TH
2193 pwq->nr_in_flight[pwq->work_color]++;
2194 work_flags = work_color_to_flags(pwq->work_color);
1e19ffc6 2195
a045a272
TH
2196 /*
2197 * Limit the number of concurrently active work items to max_active.
2198 * @work must also queue behind existing inactive work items to maintain
2199 * ordering when max_active changes. See wq_adjust_max_active().
2200 */
5797b1c1 2201 if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
fe089f87
TH
2202 if (list_empty(&pool->worklist))
2203 pool->watchdog_ts = jiffies;
2204
cdadf009 2205 trace_workqueue_activate_work(work);
fe089f87 2206 insert_work(pwq, work, &pool->worklist, work_flags);
0219a352 2207 kick_pool(pool);
8a2e8e5d 2208 } else {
f97a4a1a 2209 work_flags |= WORK_STRUCT_INACTIVE;
fe089f87 2210 insert_work(pwq, work, &pwq->inactive_works, work_flags);
8a2e8e5d 2211 }
1e19ffc6 2212
24acfb71 2213out:
fe089f87 2214 raw_spin_unlock(&pool->lock);
24acfb71 2215 rcu_read_unlock();
1da177e4
LT
2216}
2217
0fcb78c2 2218/**
c1a220e7
ZR
2219 * queue_work_on - queue work on specific cpu
2220 * @cpu: CPU number to execute work on
0fcb78c2
REB
2221 * @wq: workqueue to use
2222 * @work: work to queue
2223 *
c1a220e7 2224 * We queue the work to a specific CPU, the caller must ensure it
443378f0
PM
2225 * can't go away. Callers that fail to ensure that the specified
2226 * CPU cannot go away will execute on a randomly chosen CPU.
854f5cc5
PM
2227 * But note well that callers specifying a CPU that never has been
2228 * online will get a splat.
d185af30
YB
2229 *
2230 * Return: %false if @work was already on a queue, %true otherwise.
1da177e4 2231 */
d4283e93
TH
2232bool queue_work_on(int cpu, struct workqueue_struct *wq,
2233 struct work_struct *work)
1da177e4 2234{
d4283e93 2235 bool ret = false;
8930caba 2236 unsigned long flags;
ef1ca236 2237
8930caba 2238 local_irq_save(flags);
c1a220e7 2239
22df02bb 2240 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
4690c4ab 2241 __queue_work(cpu, wq, work);
d4283e93 2242 ret = true;
c1a220e7 2243 }
ef1ca236 2244
8930caba 2245 local_irq_restore(flags);
1da177e4
LT
2246 return ret;
2247}
ad7b1f84 2248EXPORT_SYMBOL(queue_work_on);
1da177e4 2249
8204e0c1 2250/**
fef59c9c 2251 * select_numa_node_cpu - Select a CPU based on NUMA node
8204e0c1
AD
2252 * @node: NUMA node ID that we want to select a CPU from
2253 *
2254 * This function will attempt to find a "random" cpu available on a given
2255 * node. If there are no CPUs available on the given node it will return
2256 * WORK_CPU_UNBOUND indicating that we should just schedule to any
2257 * available CPU if we need to schedule this work.
2258 */
fef59c9c 2259static int select_numa_node_cpu(int node)
8204e0c1
AD
2260{
2261 int cpu;
2262
8204e0c1
AD
2263 /* Delay binding to CPU if node is not valid or online */
2264 if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
2265 return WORK_CPU_UNBOUND;
2266
2267 /* Use local node/cpu if we are already there */
2268 cpu = raw_smp_processor_id();
2269 if (node == cpu_to_node(cpu))
2270 return cpu;
2271
2272 /* Use "random" otherwise know as "first" online CPU of node */
2273 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
2274
2275 /* If CPU is valid return that, otherwise just defer */
2276 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
2277}
2278
2279/**
2280 * queue_work_node - queue work on a "random" cpu for a given NUMA node
2281 * @node: NUMA node that we are targeting the work for
2282 * @wq: workqueue to use
2283 * @work: work to queue
2284 *
2285 * We queue the work to a "random" CPU within a given NUMA node. The basic
2286 * idea here is to provide a way to somehow associate work with a given
2287 * NUMA node.
2288 *
2289 * This function will only make a best effort attempt at getting this onto
2290 * the right NUMA node. If no node is requested or the requested node is
2291 * offline then we just fall back to standard queue_work behavior.
2292 *
2293 * Currently the "random" CPU ends up being the first available CPU in the
2294 * intersection of cpu_online_mask and the cpumask of the node, unless we
2295 * are running on the node. In that case we just use the current CPU.
2296 *
2297 * Return: %false if @work was already on a queue, %true otherwise.
2298 */
2299bool queue_work_node(int node, struct workqueue_struct *wq,
2300 struct work_struct *work)
2301{
2302 unsigned long flags;
2303 bool ret = false;
2304
2305 /*
2306 * This current implementation is specific to unbound workqueues.
2307 * Specifically we only return the first available CPU for a given
2308 * node instead of cycling through individual CPUs within the node.
2309 *
2310 * If this is used with a per-cpu workqueue then the logic in
2311 * workqueue_select_cpu_near would need to be updated to allow for
2312 * some round robin type logic.
2313 */
2314 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
2315
2316 local_irq_save(flags);
2317
2318 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
fef59c9c 2319 int cpu = select_numa_node_cpu(node);
8204e0c1
AD
2320
2321 __queue_work(cpu, wq, work);
2322 ret = true;
2323 }
2324
2325 local_irq_restore(flags);
2326 return ret;
2327}
2328EXPORT_SYMBOL_GPL(queue_work_node);
2329
8c20feb6 2330void delayed_work_timer_fn(struct timer_list *t)
1da177e4 2331{
8c20feb6 2332 struct delayed_work *dwork = from_timer(dwork, t, timer);
1da177e4 2333
e0aecdd8 2334 /* should have been called from irqsafe timer with irq already off */
60c057bc 2335 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1da177e4 2336}
1438ade5 2337EXPORT_SYMBOL(delayed_work_timer_fn);
1da177e4 2338
7beb2edf
TH
2339static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
2340 struct delayed_work *dwork, unsigned long delay)
1da177e4 2341{
7beb2edf
TH
2342 struct timer_list *timer = &dwork->timer;
2343 struct work_struct *work = &dwork->work;
7beb2edf 2344
637fdbae 2345 WARN_ON_ONCE(!wq);
4b243563 2346 WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
fc4b514f
TH
2347 WARN_ON_ONCE(timer_pending(timer));
2348 WARN_ON_ONCE(!list_empty(&work->entry));
7beb2edf 2349
8852aac2
TH
2350 /*
2351 * If @delay is 0, queue @dwork->work immediately. This is for
2352 * both optimization and correctness. The earliest @timer can
2353 * expire is on the closest next tick and delayed_work users depend
2354 * on that there's no such delay when @delay is 0.
2355 */
2356 if (!delay) {
2357 __queue_work(cpu, wq, &dwork->work);
2358 return;
2359 }
2360
60c057bc 2361 dwork->wq = wq;
1265057f 2362 dwork->cpu = cpu;
7beb2edf
TH
2363 timer->expires = jiffies + delay;
2364
aae17ebb
LB
2365 if (housekeeping_enabled(HK_TYPE_TIMER)) {
2366 /* If the current cpu is a housekeeping cpu, use it. */
2367 cpu = smp_processor_id();
2368 if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))
2369 cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
041bd12e 2370 add_timer_on(timer, cpu);
aae17ebb
LB
2371 } else {
2372 if (likely(cpu == WORK_CPU_UNBOUND))
2373 add_timer(timer);
2374 else
2375 add_timer_on(timer, cpu);
2376 }
1da177e4
LT
2377}
2378
0fcb78c2
REB
2379/**
2380 * queue_delayed_work_on - queue work on specific CPU after delay
2381 * @cpu: CPU number to execute work on
2382 * @wq: workqueue to use
af9997e4 2383 * @dwork: work to queue
0fcb78c2
REB
2384 * @delay: number of jiffies to wait before queueing
2385 *
d185af30 2386 * Return: %false if @work was already on a queue, %true otherwise. If
715f1300
TH
2387 * @delay is zero and @dwork is idle, it will be scheduled for immediate
2388 * execution.
0fcb78c2 2389 */
d4283e93
TH
2390bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
2391 struct delayed_work *dwork, unsigned long delay)
7a6bc1cd 2392{
52bad64d 2393 struct work_struct *work = &dwork->work;
d4283e93 2394 bool ret = false;
8930caba 2395 unsigned long flags;
7a6bc1cd 2396
8930caba
TH
2397 /* read the comment in __queue_work() */
2398 local_irq_save(flags);
7a6bc1cd 2399
22df02bb 2400 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
7beb2edf 2401 __queue_delayed_work(cpu, wq, dwork, delay);
d4283e93 2402 ret = true;
7a6bc1cd 2403 }
8a3e77cc 2404
8930caba 2405 local_irq_restore(flags);
7a6bc1cd
VP
2406 return ret;
2407}
ad7b1f84 2408EXPORT_SYMBOL(queue_delayed_work_on);
c7fc77f7 2409
8376fe22
TH
2410/**
2411 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
2412 * @cpu: CPU number to execute work on
2413 * @wq: workqueue to use
2414 * @dwork: work to queue
2415 * @delay: number of jiffies to wait before queueing
2416 *
2417 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
2418 * modify @dwork's timer so that it expires after @delay. If @delay is
2419 * zero, @work is guaranteed to be scheduled immediately regardless of its
2420 * current state.
2421 *
d185af30 2422 * Return: %false if @dwork was idle and queued, %true if @dwork was
8376fe22
TH
2423 * pending and its timer was modified.
2424 *
e0aecdd8 2425 * This function is safe to call from any context including IRQ handler.
8376fe22
TH
2426 * See try_to_grab_pending() for details.
2427 */
2428bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
2429 struct delayed_work *dwork, unsigned long delay)
2430{
2431 unsigned long flags;
2432 int ret;
c7fc77f7 2433
8376fe22
TH
2434 do {
2435 ret = try_to_grab_pending(&dwork->work, true, &flags);
2436 } while (unlikely(ret == -EAGAIN));
63bc0362 2437
8376fe22
TH
2438 if (likely(ret >= 0)) {
2439 __queue_delayed_work(cpu, wq, dwork, delay);
2440 local_irq_restore(flags);
7a6bc1cd 2441 }
8376fe22
TH
2442
2443 /* -ENOENT from try_to_grab_pending() becomes %true */
7a6bc1cd
VP
2444 return ret;
2445}
8376fe22
TH
2446EXPORT_SYMBOL_GPL(mod_delayed_work_on);
2447
05f0fe6b
TH
2448static void rcu_work_rcufn(struct rcu_head *rcu)
2449{
2450 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);
2451
2452 /* read the comment in __queue_work() */
2453 local_irq_disable();
2454 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);
2455 local_irq_enable();
2456}
2457
2458/**
2459 * queue_rcu_work - queue work after a RCU grace period
2460 * @wq: workqueue to use
2461 * @rwork: work to queue
2462 *
2463 * Return: %false if @rwork was already pending, %true otherwise. Note
2464 * that a full RCU grace period is guaranteed only after a %true return.
bf393fd4 2465 * While @rwork is guaranteed to be executed after a %false return, the
05f0fe6b
TH
2466 * execution may happen before a full RCU grace period has passed.
2467 */
2468bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
2469{
2470 struct work_struct *work = &rwork->work;
2471
2472 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
2473 rwork->wq = wq;
a7e30c0e 2474 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
05f0fe6b
TH
2475 return true;
2476 }
2477
2478 return false;
2479}
2480EXPORT_SYMBOL(queue_rcu_work);
2481
f7537df5 2482static struct worker *alloc_worker(int node)
c34056a3
TH
2483{
2484 struct worker *worker;
2485
f7537df5 2486 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
c8e55f36
TH
2487 if (worker) {
2488 INIT_LIST_HEAD(&worker->entry);
affee4b2 2489 INIT_LIST_HEAD(&worker->scheduled);
da028469 2490 INIT_LIST_HEAD(&worker->node);
e22bee78
TH
2491 /* on creation a worker is in !idle && prep state */
2492 worker->flags = WORKER_PREP;
c8e55f36 2493 }
c34056a3
TH
2494 return worker;
2495}
2496
9546b29e
TH
2497static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
2498{
8639eceb
TH
2499 if (pool->cpu < 0 && pool->attrs->affn_strict)
2500 return pool->attrs->__pod_cpumask;
2501 else
2502 return pool->attrs->cpumask;
9546b29e
TH
2503}
2504
4736cbf7
LJ
2505/**
2506 * worker_attach_to_pool() - attach a worker to a pool
2507 * @worker: worker to be attached
2508 * @pool: the target pool
2509 *
2510 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
2511 * cpu-binding of @worker are kept coordinated with the pool across
2512 * cpu-[un]hotplugs.
2513 */
2514static void worker_attach_to_pool(struct worker *worker,
2515 struct worker_pool *pool)
2516{
1258fae7 2517 mutex_lock(&wq_pool_attach_mutex);
4736cbf7 2518
4736cbf7 2519 /*
1258fae7
TH
2520 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
2521 * stable across this function. See the comments above the flag
2522 * definition for details.
4736cbf7
LJ
2523 */
2524 if (pool->flags & POOL_DISASSOCIATED)
2525 worker->flags |= WORKER_UNBOUND;
5c25b5ff
PZ
2526 else
2527 kthread_set_per_cpu(worker->task, pool->cpu);
4736cbf7 2528
640f17c8 2529 if (worker->rescue_wq)
9546b29e 2530 set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));
640f17c8 2531
4736cbf7 2532 list_add_tail(&worker->node, &pool->workers);
a2d812a2 2533 worker->pool = pool;
4736cbf7 2534
1258fae7 2535 mutex_unlock(&wq_pool_attach_mutex);
4736cbf7
LJ
2536}
2537
60f5a4bc
LJ
2538/**
2539 * worker_detach_from_pool() - detach a worker from its pool
2540 * @worker: worker which is attached to its pool
60f5a4bc 2541 *
4736cbf7
LJ
2542 * Undo the attaching which had been done in worker_attach_to_pool(). The
2543 * caller worker shouldn't access to the pool after detached except it has
2544 * other reference to the pool.
60f5a4bc 2545 */
a2d812a2 2546static void worker_detach_from_pool(struct worker *worker)
60f5a4bc 2547{
a2d812a2 2548 struct worker_pool *pool = worker->pool;
60f5a4bc
LJ
2549 struct completion *detach_completion = NULL;
2550
1258fae7 2551 mutex_lock(&wq_pool_attach_mutex);
a2d812a2 2552
5c25b5ff 2553 kthread_set_per_cpu(worker->task, -1);
da028469 2554 list_del(&worker->node);
a2d812a2
TH
2555 worker->pool = NULL;
2556
e02b9312 2557 if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
60f5a4bc 2558 detach_completion = pool->detach_completion;
1258fae7 2559 mutex_unlock(&wq_pool_attach_mutex);
60f5a4bc 2560
b62c0751
LJ
2561 /* clear leftover flags without pool->lock after it is detached */
2562 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
2563
60f5a4bc
LJ
2564 if (detach_completion)
2565 complete(detach_completion);
2566}
2567
c34056a3
TH
2568/**
2569 * create_worker - create a new workqueue worker
63d95a91 2570 * @pool: pool the new worker will belong to
c34056a3 2571 *
051e1850 2572 * Create and start a new worker which is attached to @pool.
c34056a3
TH
2573 *
2574 * CONTEXT:
2575 * Might sleep. Does GFP_KERNEL allocations.
2576 *
d185af30 2577 * Return:
c34056a3
TH
2578 * Pointer to the newly created worker.
2579 */
bc2ae0f5 2580static struct worker *create_worker(struct worker_pool *pool)
c34056a3 2581{
e441b56f
ZL
2582 struct worker *worker;
2583 int id;
5d9c7a1e 2584 char id_buf[23];
c34056a3 2585
7cda9aae 2586 /* ID is needed to determine kthread name */
e441b56f 2587 id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
3f0ea0b8
PM
2588 if (id < 0) {
2589 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
2590 ERR_PTR(id));
e441b56f 2591 return NULL;
3f0ea0b8 2592 }
c34056a3 2593
f7537df5 2594 worker = alloc_worker(pool->node);
3f0ea0b8
PM
2595 if (!worker) {
2596 pr_err_once("workqueue: Failed to allocate a worker\n");
c34056a3 2597 goto fail;
3f0ea0b8 2598 }
c34056a3 2599
c34056a3
TH
2600 worker->id = id;
2601
29c91e99 2602 if (pool->cpu >= 0)
e3c916a4
TH
2603 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
2604 pool->attrs->nice < 0 ? "H" : "");
f3421797 2605 else
e3c916a4
TH
2606 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
2607
f3f90ad4 2608 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
e3c916a4 2609 "kworker/%s", id_buf);
3f0ea0b8 2610 if (IS_ERR(worker->task)) {
60f54038
PM
2611 if (PTR_ERR(worker->task) == -EINTR) {
2612 pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
2613 id_buf);
2614 } else {
2615 pr_err_once("workqueue: Failed to create a worker thread: %pe",
2616 worker->task);
2617 }
c34056a3 2618 goto fail;
3f0ea0b8 2619 }
c34056a3 2620
91151228 2621 set_user_nice(worker->task, pool->attrs->nice);
9546b29e 2622 kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
91151228 2623
da028469 2624 /* successful, attach the worker to the pool */
4736cbf7 2625 worker_attach_to_pool(worker, pool);
822d8405 2626
051e1850 2627 /* start the newly created worker */
a9b8a985 2628 raw_spin_lock_irq(&pool->lock);
0219a352 2629
051e1850
LJ
2630 worker->pool->nr_workers++;
2631 worker_enter_idle(worker);
0219a352
TH
2632
2633 /*
2634 * @worker is waiting on a completion in kthread() and will trigger hung
6a229b0e
TH
2635 * check if not woken up soon. As kick_pool() is noop if @pool is empty,
2636 * wake it up explicitly.
0219a352 2637 */
051e1850 2638 wake_up_process(worker->task);
0219a352 2639
a9b8a985 2640 raw_spin_unlock_irq(&pool->lock);
051e1850 2641
c34056a3 2642 return worker;
822d8405 2643
c34056a3 2644fail:
e441b56f 2645 ida_free(&pool->worker_ida, id);
c34056a3
TH
2646 kfree(worker);
2647 return NULL;
2648}
2649
793777bc
VS
2650static void unbind_worker(struct worker *worker)
2651{
2652 lockdep_assert_held(&wq_pool_attach_mutex);
2653
2654 kthread_set_per_cpu(worker->task, -1);
2655 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
2656 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
2657 else
2658 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
2659}
2660
e02b9312
VS
2661static void wake_dying_workers(struct list_head *cull_list)
2662{
2663 struct worker *worker, *tmp;
2664
2665 list_for_each_entry_safe(worker, tmp, cull_list, entry) {
2666 list_del_init(&worker->entry);
2667 unbind_worker(worker);
2668 /*
2669 * If the worker was somehow already running, then it had to be
2670 * in pool->idle_list when set_worker_dying() happened or we
2671 * wouldn't have gotten here.
2672 *
2673 * Thus, the worker must either have observed the WORKER_DIE
2674 * flag, or have set its state to TASK_IDLE. Either way, the
2675 * below will be observed by the worker and is safe to do
2676 * outside of pool->lock.
2677 */
2678 wake_up_process(worker->task);
2679 }
2680}
2681
c34056a3 2682/**
e02b9312 2683 * set_worker_dying - Tag a worker for destruction
c34056a3 2684 * @worker: worker to be destroyed
e02b9312 2685 * @list: transfer worker away from its pool->idle_list and into list
c34056a3 2686 *
e02b9312
VS
2687 * Tag @worker for destruction and adjust @pool stats accordingly. The worker
2688 * should be idle.
c8e55f36
TH
2689 *
2690 * CONTEXT:
a9b8a985 2691 * raw_spin_lock_irq(pool->lock).
c34056a3 2692 */
e02b9312 2693static void set_worker_dying(struct worker *worker, struct list_head *list)
c34056a3 2694{
bd7bdd43 2695 struct worker_pool *pool = worker->pool;
c34056a3 2696
cd549687 2697 lockdep_assert_held(&pool->lock);
e02b9312 2698 lockdep_assert_held(&wq_pool_attach_mutex);
cd549687 2699
c34056a3 2700 /* sanity check frenzy */
6183c009 2701 if (WARN_ON(worker->current_work) ||
73eb7fe7
LJ
2702 WARN_ON(!list_empty(&worker->scheduled)) ||
2703 WARN_ON(!(worker->flags & WORKER_IDLE)))
6183c009 2704 return;
c34056a3 2705
73eb7fe7
LJ
2706 pool->nr_workers--;
2707 pool->nr_idle--;
5bdfff96 2708
cb444766 2709 worker->flags |= WORKER_DIE;
e02b9312
VS
2710
2711 list_move(&worker->entry, list);
2712 list_move(&worker->node, &pool->dying_workers);
c34056a3
TH
2713}
2714
3f959aa3
VS
2715/**
2716 * idle_worker_timeout - check if some idle workers can now be deleted.
2717 * @t: The pool's idle_timer that just expired
2718 *
2719 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
2720 * worker_leave_idle(), as a worker flicking between idle and active while its
2721 * pool is at the too_many_workers() tipping point would cause too much timer
2722 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
2723 * it expire and re-evaluate things from there.
2724 */
32a6c723 2725static void idle_worker_timeout(struct timer_list *t)
e22bee78 2726{
32a6c723 2727 struct worker_pool *pool = from_timer(pool, t, idle_timer);
3f959aa3
VS
2728 bool do_cull = false;
2729
2730 if (work_pending(&pool->idle_cull_work))
2731 return;
e22bee78 2732
a9b8a985 2733 raw_spin_lock_irq(&pool->lock);
e22bee78 2734
3f959aa3 2735 if (too_many_workers(pool)) {
e22bee78
TH
2736 struct worker *worker;
2737 unsigned long expires;
2738
2739 /* idle_list is kept in LIFO order, check the last one */
3f959aa3
VS
2740 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2741 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2742 do_cull = !time_before(jiffies, expires);
2743
2744 if (!do_cull)
2745 mod_timer(&pool->idle_timer, expires);
2746 }
2747 raw_spin_unlock_irq(&pool->lock);
2748
2749 if (do_cull)
2750 queue_work(system_unbound_wq, &pool->idle_cull_work);
2751}
2752
2753/**
2754 * idle_cull_fn - cull workers that have been idle for too long.
2755 * @work: the pool's work for handling these idle workers
2756 *
2757 * This goes through a pool's idle workers and gets rid of those that have been
2758 * idle for at least IDLE_WORKER_TIMEOUT seconds.
e02b9312
VS
2759 *
2760 * We don't want to disturb isolated CPUs because of a pcpu kworker being
2761 * culled, so this also resets worker affinity. This requires a sleepable
2762 * context, hence the split between timer callback and work item.
3f959aa3
VS
2763 */
2764static void idle_cull_fn(struct work_struct *work)
2765{
2766 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
9680540c 2767 LIST_HEAD(cull_list);
3f959aa3 2768
e02b9312
VS
2769 /*
2770 * Grabbing wq_pool_attach_mutex here ensures an already-running worker
2771 * cannot proceed beyong worker_detach_from_pool() in its self-destruct
2772 * path. This is required as a previously-preempted worker could run after
2773 * set_worker_dying() has happened but before wake_dying_workers() did.
2774 */
2775 mutex_lock(&wq_pool_attach_mutex);
3f959aa3
VS
2776 raw_spin_lock_irq(&pool->lock);
2777
2778 while (too_many_workers(pool)) {
2779 struct worker *worker;
2780 unsigned long expires;
2781
63d95a91 2782 worker = list_entry(pool->idle_list.prev, struct worker, entry);
e22bee78
TH
2783 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2784
3347fc9f 2785 if (time_before(jiffies, expires)) {
63d95a91 2786 mod_timer(&pool->idle_timer, expires);
3347fc9f 2787 break;
d5abe669 2788 }
3347fc9f 2789
e02b9312 2790 set_worker_dying(worker, &cull_list);
e22bee78
TH
2791 }
2792
a9b8a985 2793 raw_spin_unlock_irq(&pool->lock);
e02b9312
VS
2794 wake_dying_workers(&cull_list);
2795 mutex_unlock(&wq_pool_attach_mutex);
e22bee78 2796}
d5abe669 2797
493a1724 2798static void send_mayday(struct work_struct *work)
e22bee78 2799{
112202d9
TH
2800 struct pool_workqueue *pwq = get_work_pwq(work);
2801 struct workqueue_struct *wq = pwq->wq;
493a1724 2802
2e109a28 2803 lockdep_assert_held(&wq_mayday_lock);
e22bee78 2804
493008a8 2805 if (!wq->rescuer)
493a1724 2806 return;
e22bee78
TH
2807
2808 /* mayday mayday mayday */
493a1724 2809 if (list_empty(&pwq->mayday_node)) {
77668c8b
LJ
2810 /*
2811 * If @pwq is for an unbound wq, its base ref may be put at
2812 * any time due to an attribute change. Pin @pwq until the
2813 * rescuer is done with it.
2814 */
2815 get_pwq(pwq);
493a1724 2816 list_add_tail(&pwq->mayday_node, &wq->maydays);
e22bee78 2817 wake_up_process(wq->rescuer->task);
725e8ec5 2818 pwq->stats[PWQ_STAT_MAYDAY]++;
493a1724 2819 }
e22bee78
TH
2820}
2821
32a6c723 2822static void pool_mayday_timeout(struct timer_list *t)
e22bee78 2823{
32a6c723 2824 struct worker_pool *pool = from_timer(pool, t, mayday_timer);
e22bee78
TH
2825 struct work_struct *work;
2826
a9b8a985
SAS
2827 raw_spin_lock_irq(&pool->lock);
2828 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
e22bee78 2829
63d95a91 2830 if (need_to_create_worker(pool)) {
e22bee78
TH
2831 /*
2832 * We've been trying to create a new worker but
2833 * haven't been successful. We might be hitting an
2834 * allocation deadlock. Send distress signals to
2835 * rescuers.
2836 */
63d95a91 2837 list_for_each_entry(work, &pool->worklist, entry)
e22bee78 2838 send_mayday(work);
1da177e4 2839 }
e22bee78 2840
a9b8a985
SAS
2841 raw_spin_unlock(&wq_mayday_lock);
2842 raw_spin_unlock_irq(&pool->lock);
e22bee78 2843
63d95a91 2844 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1da177e4
LT
2845}
2846
e22bee78
TH
2847/**
2848 * maybe_create_worker - create a new worker if necessary
63d95a91 2849 * @pool: pool to create a new worker for
e22bee78 2850 *
63d95a91 2851 * Create a new worker for @pool if necessary. @pool is guaranteed to
e22bee78
TH
2852 * have at least one idle worker on return from this function. If
2853 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
63d95a91 2854 * sent to all rescuers with works scheduled on @pool to resolve
e22bee78
TH
2855 * possible allocation deadlock.
2856 *
c5aa87bb
TH
2857 * On return, need_to_create_worker() is guaranteed to be %false and
2858 * may_start_working() %true.
e22bee78
TH
2859 *
2860 * LOCKING:
a9b8a985 2861 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
2862 * multiple times. Does GFP_KERNEL allocations. Called only from
2863 * manager.
e22bee78 2864 */
29187a9e 2865static void maybe_create_worker(struct worker_pool *pool)
d565ed63
TH
2866__releases(&pool->lock)
2867__acquires(&pool->lock)
1da177e4 2868{
e22bee78 2869restart:
a9b8a985 2870 raw_spin_unlock_irq(&pool->lock);
9f9c2364 2871
e22bee78 2872 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
63d95a91 2873 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
e22bee78
TH
2874
2875 while (true) {
051e1850 2876 if (create_worker(pool) || !need_to_create_worker(pool))
e22bee78 2877 break;
1da177e4 2878
e212f361 2879 schedule_timeout_interruptible(CREATE_COOLDOWN);
9f9c2364 2880
63d95a91 2881 if (!need_to_create_worker(pool))
e22bee78
TH
2882 break;
2883 }
2884
63d95a91 2885 del_timer_sync(&pool->mayday_timer);
a9b8a985 2886 raw_spin_lock_irq(&pool->lock);
051e1850
LJ
2887 /*
2888 * This is necessary even after a new worker was just successfully
2889 * created as @pool->lock was dropped and the new worker might have
2890 * already become busy.
2891 */
63d95a91 2892 if (need_to_create_worker(pool))
e22bee78 2893 goto restart;
e22bee78
TH
2894}
2895
73f53c4a 2896/**
e22bee78
TH
2897 * manage_workers - manage worker pool
2898 * @worker: self
73f53c4a 2899 *
706026c2 2900 * Assume the manager role and manage the worker pool @worker belongs
e22bee78 2901 * to. At any given time, there can be only zero or one manager per
706026c2 2902 * pool. The exclusion is handled automatically by this function.
e22bee78
TH
2903 *
2904 * The caller can safely start processing works on false return. On
2905 * true return, it's guaranteed that need_to_create_worker() is false
2906 * and may_start_working() is true.
73f53c4a
TH
2907 *
2908 * CONTEXT:
a9b8a985 2909 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
e22bee78
TH
2910 * multiple times. Does GFP_KERNEL allocations.
2911 *
d185af30 2912 * Return:
29187a9e
TH
2913 * %false if the pool doesn't need management and the caller can safely
2914 * start processing works, %true if management function was performed and
2915 * the conditions that the caller verified before calling the function may
2916 * no longer be true.
73f53c4a 2917 */
e22bee78 2918static bool manage_workers(struct worker *worker)
73f53c4a 2919{
63d95a91 2920 struct worker_pool *pool = worker->pool;
73f53c4a 2921
692b4825 2922 if (pool->flags & POOL_MANAGER_ACTIVE)
29187a9e 2923 return false;
692b4825
TH
2924
2925 pool->flags |= POOL_MANAGER_ACTIVE;
2607d7a6 2926 pool->manager = worker;
1e19ffc6 2927
29187a9e 2928 maybe_create_worker(pool);
e22bee78 2929
2607d7a6 2930 pool->manager = NULL;
692b4825 2931 pool->flags &= ~POOL_MANAGER_ACTIVE;
d8bb65ab 2932 rcuwait_wake_up(&manager_wait);
29187a9e 2933 return true;
73f53c4a
TH
2934}
2935
a62428c0
TH
2936/**
2937 * process_one_work - process single work
c34056a3 2938 * @worker: self
a62428c0
TH
2939 * @work: work to process
2940 *
2941 * Process @work. This function contains all the logics necessary to
2942 * process a single work including synchronization against and
2943 * interaction with other workers on the same cpu, queueing and
2944 * flushing. As long as context requirement is met, any worker can
2945 * call this function to process a work.
2946 *
2947 * CONTEXT:
a9b8a985 2948 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
a62428c0 2949 */
c34056a3 2950static void process_one_work(struct worker *worker, struct work_struct *work)
d565ed63
TH
2951__releases(&pool->lock)
2952__acquires(&pool->lock)
a62428c0 2953{
112202d9 2954 struct pool_workqueue *pwq = get_work_pwq(work);
bd7bdd43 2955 struct worker_pool *pool = worker->pool;
c4560c2c 2956 unsigned long work_data;
a62428c0
TH
2957#ifdef CONFIG_LOCKDEP
2958 /*
2959 * It is permissible to free the struct work_struct from
2960 * inside the function that is called from it, this we need to
2961 * take into account for lockdep too. To avoid bogus "held
2962 * lock freed" warnings as well as problems when looking into
2963 * work->lockdep_map, make a copy and use that here.
2964 */
4d82a1de
PZ
2965 struct lockdep_map lockdep_map;
2966
2967 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
a62428c0 2968#endif
807407c0 2969 /* ensure we're on the correct CPU */
85327af6 2970 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
ec22ca5e 2971 raw_smp_processor_id() != pool->cpu);
25511a47 2972
8930caba 2973 /* claim and dequeue */
a62428c0 2974 debug_work_deactivate(work);
c9e7cf27 2975 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
c34056a3 2976 worker->current_work = work;
a2c1c57b 2977 worker->current_func = work->func;
112202d9 2978 worker->current_pwq = pwq;
616db877 2979 worker->current_at = worker->task->se.sum_exec_runtime;
c4560c2c 2980 work_data = *work_data_bits(work);
d812796e 2981 worker->current_color = get_work_color(work_data);
7a22ad75 2982
8bf89593
TH
2983 /*
2984 * Record wq name for cmdline and debug reporting, may get
2985 * overridden through set_worker_desc().
2986 */
2987 strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);
2988
a62428c0
TH
2989 list_del_init(&work->entry);
2990
fb0e7beb 2991 /*
228f1d00
LJ
2992 * CPU intensive works don't participate in concurrency management.
2993 * They're the scheduler's responsibility. This takes @worker out
2994 * of concurrency management and the next code block will chain
2995 * execution of the pending work items.
fb0e7beb 2996 */
616db877 2997 if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
228f1d00 2998 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
fb0e7beb 2999
974271c4 3000 /*
0219a352
TH
3001 * Kick @pool if necessary. It's always noop for per-cpu worker pools
3002 * since nr_running would always be >= 1 at this point. This is used to
3003 * chain execution of the pending work items for WORKER_NOT_RUNNING
3004 * workers such as the UNBOUND and CPU_INTENSIVE ones.
974271c4 3005 */
0219a352 3006 kick_pool(pool);
974271c4 3007
8930caba 3008 /*
7c3eed5c 3009 * Record the last pool and clear PENDING which should be the last
d565ed63 3010 * update to @work. Also, do this inside @pool->lock so that
23657bb1
TH
3011 * PENDING and queued state changes happen together while IRQ is
3012 * disabled.
8930caba 3013 */
7c3eed5c 3014 set_work_pool_and_clear_pending(work, pool->id);
a62428c0 3015
fe48ba7d 3016 pwq->stats[PWQ_STAT_STARTED]++;
a9b8a985 3017 raw_spin_unlock_irq(&pool->lock);
a62428c0 3018
a1d14934 3019 lock_map_acquire(&pwq->wq->lockdep_map);
a62428c0 3020 lock_map_acquire(&lockdep_map);
e6f3faa7 3021 /*
f52be570
PZ
3022 * Strictly speaking we should mark the invariant state without holding
3023 * any locks, that is, before these two lock_map_acquire()'s.
e6f3faa7
PZ
3024 *
3025 * However, that would result in:
3026 *
3027 * A(W1)
3028 * WFC(C)
3029 * A(W1)
3030 * C(C)
3031 *
3032 * Which would create W1->C->W1 dependencies, even though there is no
3033 * actual deadlock possible. There are two solutions, using a
3034 * read-recursive acquire on the work(queue) 'locks', but this will then
f52be570 3035 * hit the lockdep limitation on recursive locks, or simply discard
e6f3faa7
PZ
3036 * these locks.
3037 *
3038 * AFAICT there is no possible deadlock scenario between the
3039 * flush_work() and complete() primitives (except for single-threaded
3040 * workqueues), so hiding them isn't a problem.
3041 */
f52be570 3042 lockdep_invariant_state(true);
e36c886a 3043 trace_workqueue_execute_start(work);
a2c1c57b 3044 worker->current_func(work);
e36c886a
AV
3045 /*
3046 * While we must be careful to not use "work" after this, the trace
3047 * point will only record its address.
3048 */
1c5da0ec 3049 trace_workqueue_execute_end(work, worker->current_func);
725e8ec5 3050 pwq->stats[PWQ_STAT_COMPLETED]++;
a62428c0 3051 lock_map_release(&lockdep_map);
112202d9 3052 lock_map_release(&pwq->wq->lockdep_map);
a62428c0 3053
1a65a6d1
XY
3054 if (unlikely(in_atomic() || lockdep_depth(current) > 0 ||
3055 rcu_preempt_depth() > 0)) {
3056 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d/%d\n"
d75f773c 3057 " last function: %ps\n",
1a65a6d1
XY
3058 current->comm, preempt_count(), rcu_preempt_depth(),
3059 task_pid_nr(current), worker->current_func);
a62428c0
TH
3060 debug_show_held_locks(current);
3061 dump_stack();
3062 }
3063
b22ce278 3064 /*
025f50f3 3065 * The following prevents a kworker from hogging CPU on !PREEMPTION
b22ce278
TH
3066 * kernels, where a requeueing work item waiting for something to
3067 * happen could deadlock with stop_machine as such work item could
3068 * indefinitely requeue itself while all other CPUs are trapped in
789cbbec
JL
3069 * stop_machine. At the same time, report a quiescent RCU state so
3070 * the same condition doesn't freeze RCU.
b22ce278 3071 */
a7e6425e 3072 cond_resched();
b22ce278 3073
a9b8a985 3074 raw_spin_lock_irq(&pool->lock);
a62428c0 3075
616db877
TH
3076 /*
3077 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
3078 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
3079 * wq_cpu_intensive_thresh_us. Clear it.
3080 */
3081 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
fb0e7beb 3082
1b69ac6b
JW
3083 /* tag the worker for identification in schedule() */
3084 worker->last_func = worker->current_func;
3085
a62428c0 3086 /* we're done with it, release */
42f8570f 3087 hash_del(&worker->hentry);
c34056a3 3088 worker->current_work = NULL;
a2c1c57b 3089 worker->current_func = NULL;
112202d9 3090 worker->current_pwq = NULL;
d812796e 3091 worker->current_color = INT_MAX;
dd6c3c54
TH
3092
3093 /* must be the last step, see the function comment */
c4560c2c 3094 pwq_dec_nr_in_flight(pwq, work_data);
a62428c0
TH
3095}
3096
affee4b2
TH
3097/**
3098 * process_scheduled_works - process scheduled works
3099 * @worker: self
3100 *
3101 * Process all scheduled works. Please note that the scheduled list
3102 * may change while processing a work, so this function repeatedly
3103 * fetches a work from the top and executes it.
3104 *
3105 * CONTEXT:
a9b8a985 3106 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
affee4b2
TH
3107 * multiple times.
3108 */
3109static void process_scheduled_works(struct worker *worker)
1da177e4 3110{
c0ab017d
TH
3111 struct work_struct *work;
3112 bool first = true;
3113
3114 while ((work = list_first_entry_or_null(&worker->scheduled,
3115 struct work_struct, entry))) {
3116 if (first) {
3117 worker->pool->watchdog_ts = jiffies;
3118 first = false;
3119 }
c34056a3 3120 process_one_work(worker, work);
1da177e4 3121 }
1da177e4
LT
3122}
3123
197f6acc
TH
3124static void set_pf_worker(bool val)
3125{
3126 mutex_lock(&wq_pool_attach_mutex);
3127 if (val)
3128 current->flags |= PF_WQ_WORKER;
3129 else
3130 current->flags &= ~PF_WQ_WORKER;
3131 mutex_unlock(&wq_pool_attach_mutex);
3132}
3133
4690c4ab
TH
3134/**
3135 * worker_thread - the worker thread function
c34056a3 3136 * @__worker: self
4690c4ab 3137 *
c5aa87bb
TH
3138 * The worker thread function. All workers belong to a worker_pool -
3139 * either a per-cpu one or dynamic unbound one. These workers process all
3140 * work items regardless of their specific target workqueue. The only
3141 * exception is work items which belong to workqueues with a rescuer which
3142 * will be explained in rescuer_thread().
d185af30
YB
3143 *
3144 * Return: 0
4690c4ab 3145 */
c34056a3 3146static int worker_thread(void *__worker)
1da177e4 3147{
c34056a3 3148 struct worker *worker = __worker;
bd7bdd43 3149 struct worker_pool *pool = worker->pool;
1da177e4 3150
e22bee78 3151 /* tell the scheduler that this is a workqueue worker */
197f6acc 3152 set_pf_worker(true);
c8e55f36 3153woke_up:
a9b8a985 3154 raw_spin_lock_irq(&pool->lock);
1da177e4 3155
a9ab775b
TH
3156 /* am I supposed to die? */
3157 if (unlikely(worker->flags & WORKER_DIE)) {
a9b8a985 3158 raw_spin_unlock_irq(&pool->lock);
197f6acc 3159 set_pf_worker(false);
60f5a4bc
LJ
3160
3161 set_task_comm(worker->task, "kworker/dying");
e441b56f 3162 ida_free(&pool->worker_ida, worker->id);
a2d812a2 3163 worker_detach_from_pool(worker);
e02b9312 3164 WARN_ON_ONCE(!list_empty(&worker->entry));
60f5a4bc 3165 kfree(worker);
a9ab775b 3166 return 0;
c8e55f36 3167 }
affee4b2 3168
c8e55f36 3169 worker_leave_idle(worker);
db7bccf4 3170recheck:
e22bee78 3171 /* no more worker necessary? */
63d95a91 3172 if (!need_more_worker(pool))
e22bee78
TH
3173 goto sleep;
3174
3175 /* do we need to manage? */
63d95a91 3176 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
e22bee78
TH
3177 goto recheck;
3178
c8e55f36
TH
3179 /*
3180 * ->scheduled list can only be filled while a worker is
3181 * preparing to process a work or actually processing it.
3182 * Make sure nobody diddled with it while I was sleeping.
3183 */
6183c009 3184 WARN_ON_ONCE(!list_empty(&worker->scheduled));
c8e55f36 3185
e22bee78 3186 /*
a9ab775b
TH
3187 * Finish PREP stage. We're guaranteed to have at least one idle
3188 * worker or that someone else has already assumed the manager
3189 * role. This is where @worker starts participating in concurrency
3190 * management if applicable and concurrency management is restored
3191 * after being rebound. See rebind_workers() for details.
e22bee78 3192 */
a9ab775b 3193 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
e22bee78
TH
3194
3195 do {
c8e55f36 3196 struct work_struct *work =
bd7bdd43 3197 list_first_entry(&pool->worklist,
c8e55f36
TH
3198 struct work_struct, entry);
3199
873eaca6
TH
3200 if (assign_work(work, worker, NULL))
3201 process_scheduled_works(worker);
63d95a91 3202 } while (keep_working(pool));
e22bee78 3203
228f1d00 3204 worker_set_flags(worker, WORKER_PREP);
d313dd85 3205sleep:
c8e55f36 3206 /*
d565ed63
TH
3207 * pool->lock is held and there's no work to process and no need to
3208 * manage, sleep. Workers are woken up only while holding
3209 * pool->lock or from local cpu, so setting the current state
3210 * before releasing pool->lock is enough to prevent losing any
3211 * event.
c8e55f36
TH
3212 */
3213 worker_enter_idle(worker);
c5a94a61 3214 __set_current_state(TASK_IDLE);
a9b8a985 3215 raw_spin_unlock_irq(&pool->lock);
c8e55f36
TH
3216 schedule();
3217 goto woke_up;
1da177e4
LT
3218}
3219
e22bee78
TH
3220/**
3221 * rescuer_thread - the rescuer thread function
111c225a 3222 * @__rescuer: self
e22bee78
TH
3223 *
3224 * Workqueue rescuer thread function. There's one rescuer for each
493008a8 3225 * workqueue which has WQ_MEM_RECLAIM set.
e22bee78 3226 *
706026c2 3227 * Regular work processing on a pool may block trying to create a new
e22bee78
TH
3228 * worker which uses GFP_KERNEL allocation which has slight chance of
3229 * developing into deadlock if some works currently on the same queue
3230 * need to be processed to satisfy the GFP_KERNEL allocation. This is
3231 * the problem rescuer solves.
3232 *
706026c2
TH
3233 * When such condition is possible, the pool summons rescuers of all
3234 * workqueues which have works queued on the pool and let them process
e22bee78
TH
3235 * those works so that forward progress can be guaranteed.
3236 *
3237 * This should happen rarely.
d185af30
YB
3238 *
3239 * Return: 0
e22bee78 3240 */
111c225a 3241static int rescuer_thread(void *__rescuer)
e22bee78 3242{
111c225a
TH
3243 struct worker *rescuer = __rescuer;
3244 struct workqueue_struct *wq = rescuer->rescue_wq;
4d595b86 3245 bool should_stop;
e22bee78
TH
3246
3247 set_user_nice(current, RESCUER_NICE_LEVEL);
111c225a
TH
3248
3249 /*
3250 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
3251 * doesn't participate in concurrency management.
3252 */
197f6acc 3253 set_pf_worker(true);
e22bee78 3254repeat:
c5a94a61 3255 set_current_state(TASK_IDLE);
e22bee78 3256
4d595b86
LJ
3257 /*
3258 * By the time the rescuer is requested to stop, the workqueue
3259 * shouldn't have any work pending, but @wq->maydays may still have
3260 * pwq(s) queued. This can happen by non-rescuer workers consuming
3261 * all the work items before the rescuer got to them. Go through
3262 * @wq->maydays processing before acting on should_stop so that the
3263 * list is always empty on exit.
3264 */
3265 should_stop = kthread_should_stop();
e22bee78 3266
493a1724 3267 /* see whether any pwq is asking for help */
a9b8a985 3268 raw_spin_lock_irq(&wq_mayday_lock);
493a1724
TH
3269
3270 while (!list_empty(&wq->maydays)) {
3271 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
3272 struct pool_workqueue, mayday_node);
112202d9 3273 struct worker_pool *pool = pwq->pool;
e22bee78
TH
3274 struct work_struct *work, *n;
3275
3276 __set_current_state(TASK_RUNNING);
493a1724
TH
3277 list_del_init(&pwq->mayday_node);
3278
a9b8a985 3279 raw_spin_unlock_irq(&wq_mayday_lock);
e22bee78 3280
51697d39
LJ
3281 worker_attach_to_pool(rescuer, pool);
3282
a9b8a985 3283 raw_spin_lock_irq(&pool->lock);
e22bee78
TH
3284
3285 /*
3286 * Slurp in all works issued via this workqueue and
3287 * process'em.
3288 */
873eaca6 3289 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
82607adc 3290 list_for_each_entry_safe(work, n, &pool->worklist, entry) {
873eaca6
TH
3291 if (get_work_pwq(work) == pwq &&
3292 assign_work(work, rescuer, &n))
725e8ec5 3293 pwq->stats[PWQ_STAT_RESCUED]++;
82607adc 3294 }
e22bee78 3295
873eaca6 3296 if (!list_empty(&rescuer->scheduled)) {
008847f6
N
3297 process_scheduled_works(rescuer);
3298
3299 /*
3300 * The above execution of rescued work items could
3301 * have created more to rescue through
f97a4a1a 3302 * pwq_activate_first_inactive() or chained
008847f6
N
3303 * queueing. Let's put @pwq back on mayday list so
3304 * that such back-to-back work items, which may be
3305 * being used to relieve memory pressure, don't
3306 * incur MAYDAY_INTERVAL delay inbetween.
3307 */
4f3f4cf3 3308 if (pwq->nr_active && need_to_create_worker(pool)) {
a9b8a985 3309 raw_spin_lock(&wq_mayday_lock);
e66b39af
TH
3310 /*
3311 * Queue iff we aren't racing destruction
3312 * and somebody else hasn't queued it already.
3313 */
3314 if (wq->rescuer && list_empty(&pwq->mayday_node)) {
3315 get_pwq(pwq);
3316 list_add_tail(&pwq->mayday_node, &wq->maydays);
3317 }
a9b8a985 3318 raw_spin_unlock(&wq_mayday_lock);
008847f6
N
3319 }
3320 }
7576958a 3321
77668c8b
LJ
3322 /*
3323 * Put the reference grabbed by send_mayday(). @pool won't
13b1d625 3324 * go away while we're still attached to it.
77668c8b
LJ
3325 */
3326 put_pwq(pwq);
3327
7576958a 3328 /*
0219a352
TH
3329 * Leave this pool. Notify regular workers; otherwise, we end up
3330 * with 0 concurrency and stalling the execution.
7576958a 3331 */
0219a352 3332 kick_pool(pool);
7576958a 3333
a9b8a985 3334 raw_spin_unlock_irq(&pool->lock);
13b1d625 3335
a2d812a2 3336 worker_detach_from_pool(rescuer);
13b1d625 3337
a9b8a985 3338 raw_spin_lock_irq(&wq_mayday_lock);
e22bee78
TH
3339 }
3340
a9b8a985 3341 raw_spin_unlock_irq(&wq_mayday_lock);
493a1724 3342
4d595b86
LJ
3343 if (should_stop) {
3344 __set_current_state(TASK_RUNNING);
197f6acc 3345 set_pf_worker(false);
4d595b86
LJ
3346 return 0;
3347 }
3348
111c225a
TH
3349 /* rescuers should never participate in concurrency management */
3350 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
e22bee78
TH
3351 schedule();
3352 goto repeat;
1da177e4
LT
3353}
3354
fca839c0
TH
3355/**
3356 * check_flush_dependency - check for flush dependency sanity
3357 * @target_wq: workqueue being flushed
3358 * @target_work: work item being flushed (NULL for workqueue flushes)
3359 *
3360 * %current is trying to flush the whole @target_wq or @target_work on it.
3361 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
3362 * reclaiming memory or running on a workqueue which doesn't have
3363 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
3364 * a deadlock.
3365 */
3366static void check_flush_dependency(struct workqueue_struct *target_wq,
3367 struct work_struct *target_work)
3368{
3369 work_func_t target_func = target_work ? target_work->func : NULL;
3370 struct worker *worker;
3371
3372 if (target_wq->flags & WQ_MEM_RECLAIM)
3373 return;
3374
3375 worker = current_wq_worker();
3376
3377 WARN_ONCE(current->flags & PF_MEMALLOC,
d75f773c 3378 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
fca839c0 3379 current->pid, current->comm, target_wq->name, target_func);
23d11a58
TH
3380 WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
3381 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
d75f773c 3382 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
fca839c0
TH
3383 worker->current_pwq->wq->name, worker->current_func,
3384 target_wq->name, target_func);
3385}
3386
fc2e4d70
ON
3387struct wq_barrier {
3388 struct work_struct work;
3389 struct completion done;
2607d7a6 3390 struct task_struct *task; /* purely informational */
fc2e4d70
ON
3391};
3392
3393static void wq_barrier_func(struct work_struct *work)
3394{
3395 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
3396 complete(&barr->done);
3397}
3398
4690c4ab
TH
3399/**
3400 * insert_wq_barrier - insert a barrier work
112202d9 3401 * @pwq: pwq to insert barrier into
4690c4ab 3402 * @barr: wq_barrier to insert
affee4b2
TH
3403 * @target: target work to attach @barr to
3404 * @worker: worker currently executing @target, NULL if @target is not executing
4690c4ab 3405 *
affee4b2
TH
3406 * @barr is linked to @target such that @barr is completed only after
3407 * @target finishes execution. Please note that the ordering
3408 * guarantee is observed only with respect to @target and on the local
3409 * cpu.
3410 *
3411 * Currently, a queued barrier can't be canceled. This is because
3412 * try_to_grab_pending() can't determine whether the work to be
3413 * grabbed is at the head of the queue and thus can't clear LINKED
3414 * flag of the previous work while there must be a valid next work
3415 * after a work with LINKED flag set.
3416 *
3417 * Note that when @worker is non-NULL, @target may be modified
112202d9 3418 * underneath us, so we can't reliably determine pwq from @target.
4690c4ab
TH
3419 *
3420 * CONTEXT:
a9b8a985 3421 * raw_spin_lock_irq(pool->lock).
4690c4ab 3422 */
112202d9 3423static void insert_wq_barrier(struct pool_workqueue *pwq,
affee4b2
TH
3424 struct wq_barrier *barr,
3425 struct work_struct *target, struct worker *worker)
fc2e4d70 3426{
d812796e
LJ
3427 unsigned int work_flags = 0;
3428 unsigned int work_color;
affee4b2 3429 struct list_head *head;
affee4b2 3430
dc186ad7 3431 /*
d565ed63 3432 * debugobject calls are safe here even with pool->lock locked
dc186ad7
TG
3433 * as we know for sure that this will not trigger any of the
3434 * checks and call back into the fixup functions where we
3435 * might deadlock.
3436 */
ca1cab37 3437 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
22df02bb 3438 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
52fa5bc5 3439
fd1a5b04
BP
3440 init_completion_map(&barr->done, &target->lockdep_map);
3441
2607d7a6 3442 barr->task = current;
83c22520 3443
5797b1c1 3444 /* The barrier work item does not participate in nr_active. */
018f3a13
LJ
3445 work_flags |= WORK_STRUCT_INACTIVE;
3446
affee4b2
TH
3447 /*
3448 * If @target is currently being executed, schedule the
3449 * barrier to the worker; otherwise, put it after @target.
3450 */
d812796e 3451 if (worker) {
affee4b2 3452 head = worker->scheduled.next;
d812796e
LJ
3453 work_color = worker->current_color;
3454 } else {
affee4b2
TH
3455 unsigned long *bits = work_data_bits(target);
3456
3457 head = target->entry.next;
3458 /* there can already be other linked works, inherit and set */
d21cece0 3459 work_flags |= *bits & WORK_STRUCT_LINKED;
d812796e 3460 work_color = get_work_color(*bits);
affee4b2
TH
3461 __set_bit(WORK_STRUCT_LINKED_BIT, bits);
3462 }
3463
d812796e
LJ
3464 pwq->nr_in_flight[work_color]++;
3465 work_flags |= work_color_to_flags(work_color);
3466
d21cece0 3467 insert_work(pwq, &barr->work, head, work_flags);
fc2e4d70
ON
3468}
3469
73f53c4a 3470/**
112202d9 3471 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
73f53c4a
TH
3472 * @wq: workqueue being flushed
3473 * @flush_color: new flush color, < 0 for no-op
3474 * @work_color: new work color, < 0 for no-op
3475 *
112202d9 3476 * Prepare pwqs for workqueue flushing.
73f53c4a 3477 *
112202d9
TH
3478 * If @flush_color is non-negative, flush_color on all pwqs should be
3479 * -1. If no pwq has in-flight commands at the specified color, all
3480 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
3481 * has in flight commands, its pwq->flush_color is set to
3482 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
73f53c4a
TH
3483 * wakeup logic is armed and %true is returned.
3484 *
3485 * The caller should have initialized @wq->first_flusher prior to
3486 * calling this function with non-negative @flush_color. If
3487 * @flush_color is negative, no flush color update is done and %false
3488 * is returned.
3489 *
112202d9 3490 * If @work_color is non-negative, all pwqs should have the same
73f53c4a
TH
3491 * work_color which is previous to @work_color and all will be
3492 * advanced to @work_color.
3493 *
3494 * CONTEXT:
3c25a55d 3495 * mutex_lock(wq->mutex).
73f53c4a 3496 *
d185af30 3497 * Return:
73f53c4a
TH
3498 * %true if @flush_color >= 0 and there's something to flush. %false
3499 * otherwise.
3500 */
112202d9 3501static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
73f53c4a 3502 int flush_color, int work_color)
1da177e4 3503{
73f53c4a 3504 bool wait = false;
49e3cf44 3505 struct pool_workqueue *pwq;
1da177e4 3506
73f53c4a 3507 if (flush_color >= 0) {
6183c009 3508 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
112202d9 3509 atomic_set(&wq->nr_pwqs_to_flush, 1);
1da177e4 3510 }
2355b70f 3511
49e3cf44 3512 for_each_pwq(pwq, wq) {
112202d9 3513 struct worker_pool *pool = pwq->pool;
fc2e4d70 3514
a9b8a985 3515 raw_spin_lock_irq(&pool->lock);
83c22520 3516
73f53c4a 3517 if (flush_color >= 0) {
6183c009 3518 WARN_ON_ONCE(pwq->flush_color != -1);
fc2e4d70 3519
112202d9
TH
3520 if (pwq->nr_in_flight[flush_color]) {
3521 pwq->flush_color = flush_color;
3522 atomic_inc(&wq->nr_pwqs_to_flush);
73f53c4a
TH
3523 wait = true;
3524 }
3525 }
1da177e4 3526
73f53c4a 3527 if (work_color >= 0) {
6183c009 3528 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
112202d9 3529 pwq->work_color = work_color;
73f53c4a 3530 }
1da177e4 3531
a9b8a985 3532 raw_spin_unlock_irq(&pool->lock);
1da177e4 3533 }
2355b70f 3534
112202d9 3535 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
73f53c4a 3536 complete(&wq->first_flusher->done);
14441960 3537
73f53c4a 3538 return wait;
1da177e4
LT
3539}
3540
0fcb78c2 3541/**
c4f135d6 3542 * __flush_workqueue - ensure that any scheduled work has run to completion.
0fcb78c2 3543 * @wq: workqueue to flush
1da177e4 3544 *
c5aa87bb
TH
3545 * This function sleeps until all work items which were queued on entry
3546 * have finished execution, but it is not livelocked by new incoming ones.
1da177e4 3547 */
c4f135d6 3548void __flush_workqueue(struct workqueue_struct *wq)
1da177e4 3549{
73f53c4a
TH
3550 struct wq_flusher this_flusher = {
3551 .list = LIST_HEAD_INIT(this_flusher.list),
3552 .flush_color = -1,
fd1a5b04 3553 .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
73f53c4a
TH
3554 };
3555 int next_color;
1da177e4 3556
3347fa09
TH
3557 if (WARN_ON(!wq_online))
3558 return;
3559
87915adc
JB
3560 lock_map_acquire(&wq->lockdep_map);
3561 lock_map_release(&wq->lockdep_map);
3562
3c25a55d 3563 mutex_lock(&wq->mutex);
73f53c4a
TH
3564
3565 /*
3566 * Start-to-wait phase
3567 */
3568 next_color = work_next_color(wq->work_color);
3569
3570 if (next_color != wq->flush_color) {
3571 /*
3572 * Color space is not full. The current work_color
3573 * becomes our flush_color and work_color is advanced
3574 * by one.
3575 */
6183c009 3576 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
73f53c4a
TH
3577 this_flusher.flush_color = wq->work_color;
3578 wq->work_color = next_color;
3579
3580 if (!wq->first_flusher) {
3581 /* no flush in progress, become the first flusher */
6183c009 3582 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
3583
3584 wq->first_flusher = &this_flusher;
3585
112202d9 3586 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
73f53c4a
TH
3587 wq->work_color)) {
3588 /* nothing to flush, done */
3589 wq->flush_color = next_color;
3590 wq->first_flusher = NULL;
3591 goto out_unlock;
3592 }
3593 } else {
3594 /* wait in queue */
6183c009 3595 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
73f53c4a 3596 list_add_tail(&this_flusher.list, &wq->flusher_queue);
112202d9 3597 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
3598 }
3599 } else {
3600 /*
3601 * Oops, color space is full, wait on overflow queue.
3602 * The next flush completion will assign us
3603 * flush_color and transfer to flusher_queue.
3604 */
3605 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
3606 }
3607
fca839c0
TH
3608 check_flush_dependency(wq, NULL);
3609
3c25a55d 3610 mutex_unlock(&wq->mutex);
73f53c4a
TH
3611
3612 wait_for_completion(&this_flusher.done);
3613
3614 /*
3615 * Wake-up-and-cascade phase
3616 *
3617 * First flushers are responsible for cascading flushes and
3618 * handling overflow. Non-first flushers can simply return.
3619 */
00d5d15b 3620 if (READ_ONCE(wq->first_flusher) != &this_flusher)
73f53c4a
TH
3621 return;
3622
3c25a55d 3623 mutex_lock(&wq->mutex);
73f53c4a 3624
4ce48b37
TH
3625 /* we might have raced, check again with mutex held */
3626 if (wq->first_flusher != &this_flusher)
3627 goto out_unlock;
3628
00d5d15b 3629 WRITE_ONCE(wq->first_flusher, NULL);
73f53c4a 3630
6183c009
TH
3631 WARN_ON_ONCE(!list_empty(&this_flusher.list));
3632 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
73f53c4a
TH
3633
3634 while (true) {
3635 struct wq_flusher *next, *tmp;
3636
3637 /* complete all the flushers sharing the current flush color */
3638 list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
3639 if (next->flush_color != wq->flush_color)
3640 break;
3641 list_del_init(&next->list);
3642 complete(&next->done);
3643 }
3644
6183c009
TH
3645 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
3646 wq->flush_color != work_next_color(wq->work_color));
73f53c4a
TH
3647
3648 /* this flush_color is finished, advance by one */
3649 wq->flush_color = work_next_color(wq->flush_color);
3650
3651 /* one color has been freed, handle overflow queue */
3652 if (!list_empty(&wq->flusher_overflow)) {
3653 /*
3654 * Assign the same color to all overflowed
3655 * flushers, advance work_color and append to
3656 * flusher_queue. This is the start-to-wait
3657 * phase for these overflowed flushers.
3658 */
3659 list_for_each_entry(tmp, &wq->flusher_overflow, list)
3660 tmp->flush_color = wq->work_color;
3661
3662 wq->work_color = work_next_color(wq->work_color);
3663
3664 list_splice_tail_init(&wq->flusher_overflow,
3665 &wq->flusher_queue);
112202d9 3666 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
73f53c4a
TH
3667 }
3668
3669 if (list_empty(&wq->flusher_queue)) {
6183c009 3670 WARN_ON_ONCE(wq->flush_color != wq->work_color);
73f53c4a
TH
3671 break;
3672 }
3673
3674 /*
3675 * Need to flush more colors. Make the next flusher
112202d9 3676 * the new first flusher and arm pwqs.
73f53c4a 3677 */
6183c009
TH
3678 WARN_ON_ONCE(wq->flush_color == wq->work_color);
3679 WARN_ON_ONCE(wq->flush_color != next->flush_color);
73f53c4a
TH
3680
3681 list_del_init(&next->list);
3682 wq->first_flusher = next;
3683
112202d9 3684 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
73f53c4a
TH
3685 break;
3686
3687 /*
3688 * Meh... this color is already done, clear first
3689 * flusher and repeat cascading.
3690 */
3691 wq->first_flusher = NULL;
3692 }
3693
3694out_unlock:
3c25a55d 3695 mutex_unlock(&wq->mutex);
1da177e4 3696}
c4f135d6 3697EXPORT_SYMBOL(__flush_workqueue);
1da177e4 3698
9c5a2ba7
TH
3699/**
3700 * drain_workqueue - drain a workqueue
3701 * @wq: workqueue to drain
3702 *
3703 * Wait until the workqueue becomes empty. While draining is in progress,
3704 * only chain queueing is allowed. IOW, only currently pending or running
3705 * work items on @wq can queue further work items on it. @wq is flushed
b749b1b6 3706 * repeatedly until it becomes empty. The number of flushing is determined
9c5a2ba7
TH
3707 * by the depth of chaining and should be relatively short. Whine if it
3708 * takes too long.
3709 */
3710void drain_workqueue(struct workqueue_struct *wq)
3711{
3712 unsigned int flush_cnt = 0;
49e3cf44 3713 struct pool_workqueue *pwq;
9c5a2ba7
TH
3714
3715 /*
3716 * __queue_work() needs to test whether there are drainers, is much
3717 * hotter than drain_workqueue() and already looks at @wq->flags.
618b01eb 3718 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
9c5a2ba7 3719 */
87fc741e 3720 mutex_lock(&wq->mutex);
9c5a2ba7 3721 if (!wq->nr_drainers++)
618b01eb 3722 wq->flags |= __WQ_DRAINING;
87fc741e 3723 mutex_unlock(&wq->mutex);
9c5a2ba7 3724reflush:
c4f135d6 3725 __flush_workqueue(wq);
9c5a2ba7 3726
b09f4fd3 3727 mutex_lock(&wq->mutex);
76af4d93 3728
49e3cf44 3729 for_each_pwq(pwq, wq) {
fa2563e4 3730 bool drained;
9c5a2ba7 3731
a9b8a985 3732 raw_spin_lock_irq(&pwq->pool->lock);
afa87ce8 3733 drained = pwq_is_empty(pwq);
a9b8a985 3734 raw_spin_unlock_irq(&pwq->pool->lock);
fa2563e4
TT
3735
3736 if (drained)
9c5a2ba7
TH
3737 continue;
3738
3739 if (++flush_cnt == 10 ||
3740 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
e9ad2eb3
SZ
3741 pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
3742 wq->name, __func__, flush_cnt);
76af4d93 3743
b09f4fd3 3744 mutex_unlock(&wq->mutex);
9c5a2ba7
TH
3745 goto reflush;
3746 }
3747
9c5a2ba7 3748 if (!--wq->nr_drainers)
618b01eb 3749 wq->flags &= ~__WQ_DRAINING;
87fc741e 3750 mutex_unlock(&wq->mutex);
9c5a2ba7
TH
3751}
3752EXPORT_SYMBOL_GPL(drain_workqueue);
3753
d6e89786
JB
3754static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
3755 bool from_cancel)
db700897 3756{
affee4b2 3757 struct worker *worker = NULL;
c9e7cf27 3758 struct worker_pool *pool;
112202d9 3759 struct pool_workqueue *pwq;
db700897
ON
3760
3761 might_sleep();
fa1b54e6 3762
24acfb71 3763 rcu_read_lock();
c9e7cf27 3764 pool = get_work_pool(work);
fa1b54e6 3765 if (!pool) {
24acfb71 3766 rcu_read_unlock();
baf59022 3767 return false;
fa1b54e6 3768 }
db700897 3769
a9b8a985 3770 raw_spin_lock_irq(&pool->lock);
0b3dae68 3771 /* see the comment in try_to_grab_pending() with the same code */
112202d9
TH
3772 pwq = get_work_pwq(work);
3773 if (pwq) {
3774 if (unlikely(pwq->pool != pool))
4690c4ab 3775 goto already_gone;
606a5020 3776 } else {
c9e7cf27 3777 worker = find_worker_executing_work(pool, work);
affee4b2 3778 if (!worker)
4690c4ab 3779 goto already_gone;
112202d9 3780 pwq = worker->current_pwq;
606a5020 3781 }
db700897 3782
fca839c0
TH
3783 check_flush_dependency(pwq->wq, work);
3784
112202d9 3785 insert_wq_barrier(pwq, barr, work, worker);
a9b8a985 3786 raw_spin_unlock_irq(&pool->lock);
7a22ad75 3787
e159489b 3788 /*
a1d14934
PZ
3789 * Force a lock recursion deadlock when using flush_work() inside a
3790 * single-threaded or rescuer equipped workqueue.
3791 *
3792 * For single threaded workqueues the deadlock happens when the work
3793 * is after the work issuing the flush_work(). For rescuer equipped
3794 * workqueues the deadlock happens when the rescuer stalls, blocking
3795 * forward progress.
e159489b 3796 */
d6e89786
JB
3797 if (!from_cancel &&
3798 (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
112202d9 3799 lock_map_acquire(&pwq->wq->lockdep_map);
a1d14934
PZ
3800 lock_map_release(&pwq->wq->lockdep_map);
3801 }
24acfb71 3802 rcu_read_unlock();
401a8d04 3803 return true;
4690c4ab 3804already_gone:
a9b8a985 3805 raw_spin_unlock_irq(&pool->lock);
24acfb71 3806 rcu_read_unlock();
401a8d04 3807 return false;
db700897 3808}
baf59022 3809
d6e89786
JB
3810static bool __flush_work(struct work_struct *work, bool from_cancel)
3811{
3812 struct wq_barrier barr;
3813
3814 if (WARN_ON(!wq_online))
3815 return false;
3816
4d43d395
TH
3817 if (WARN_ON(!work->func))
3818 return false;
3819
c0feea59
TH
3820 lock_map_acquire(&work->lockdep_map);
3821 lock_map_release(&work->lockdep_map);
87915adc 3822
d6e89786
JB
3823 if (start_flush_work(work, &barr, from_cancel)) {
3824 wait_for_completion(&barr.done);
3825 destroy_work_on_stack(&barr.work);
3826 return true;
3827 } else {
3828 return false;
3829 }
3830}
3831
baf59022
TH
3832/**
3833 * flush_work - wait for a work to finish executing the last queueing instance
3834 * @work: the work to flush
3835 *
606a5020
TH
3836 * Wait until @work has finished execution. @work is guaranteed to be idle
3837 * on return if it hasn't been requeued since flush started.
baf59022 3838 *
d185af30 3839 * Return:
baf59022
TH
3840 * %true if flush_work() waited for the work to finish execution,
3841 * %false if it was already idle.
3842 */
3843bool flush_work(struct work_struct *work)
3844{
d6e89786 3845 return __flush_work(work, false);
6e84d644 3846}
606a5020 3847EXPORT_SYMBOL_GPL(flush_work);
6e84d644 3848
8603e1b3 3849struct cwt_wait {
ac6424b9 3850 wait_queue_entry_t wait;
8603e1b3
TH
3851 struct work_struct *work;
3852};
3853
ac6424b9 3854static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
8603e1b3
TH
3855{
3856 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
3857
3858 if (cwait->work != key)
3859 return 0;
3860 return autoremove_wake_function(wait, mode, sync, key);
3861}
3862
36e227d2 3863static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
1f1f642e 3864{
8603e1b3 3865 static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
bbb68dfa 3866 unsigned long flags;
1f1f642e
ON
3867 int ret;
3868
3869 do {
bbb68dfa
TH
3870 ret = try_to_grab_pending(work, is_dwork, &flags);
3871 /*
8603e1b3
TH
3872 * If someone else is already canceling, wait for it to
3873 * finish. flush_work() doesn't work for PREEMPT_NONE
3874 * because we may get scheduled between @work's completion
3875 * and the other canceling task resuming and clearing
3876 * CANCELING - flush_work() will return false immediately
3877 * as @work is no longer busy, try_to_grab_pending() will
3878 * return -ENOENT as @work is still being canceled and the
3879 * other canceling task won't be able to clear CANCELING as
3880 * we're hogging the CPU.
3881 *
3882 * Let's wait for completion using a waitqueue. As this
3883 * may lead to the thundering herd problem, use a custom
3884 * wake function which matches @work along with exclusive
3885 * wait and wakeup.
bbb68dfa 3886 */
8603e1b3
TH
3887 if (unlikely(ret == -ENOENT)) {
3888 struct cwt_wait cwait;
3889
3890 init_wait(&cwait.wait);
3891 cwait.wait.func = cwt_wakefn;
3892 cwait.work = work;
3893
3894 prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
3895 TASK_UNINTERRUPTIBLE);
3896 if (work_is_canceling(work))
3897 schedule();
3898 finish_wait(&cancel_waitq, &cwait.wait);
3899 }
1f1f642e
ON
3900 } while (unlikely(ret < 0));
3901
bbb68dfa
TH
3902 /* tell other tasks trying to grab @work to back off */
3903 mark_work_canceling(work);
3904 local_irq_restore(flags);
3905
3347fa09
TH
3906 /*
3907 * This allows canceling during early boot. We know that @work
3908 * isn't executing.
3909 */
3910 if (wq_online)
d6e89786 3911 __flush_work(work, true);
3347fa09 3912
7a22ad75 3913 clear_work_data(work);
8603e1b3
TH
3914
3915 /*
3916 * Paired with prepare_to_wait() above so that either
3917 * waitqueue_active() is visible here or !work_is_canceling() is
3918 * visible there.
3919 */
3920 smp_mb();
3921 if (waitqueue_active(&cancel_waitq))
3922 __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
3923
1f1f642e
ON
3924 return ret;
3925}
3926
6e84d644 3927/**
401a8d04
TH
3928 * cancel_work_sync - cancel a work and wait for it to finish
3929 * @work: the work to cancel
6e84d644 3930 *
401a8d04
TH
3931 * Cancel @work and wait for its execution to finish. This function
3932 * can be used even if the work re-queues itself or migrates to
3933 * another workqueue. On return from this function, @work is
3934 * guaranteed to be not pending or executing on any CPU.
1f1f642e 3935 *
401a8d04
TH
3936 * cancel_work_sync(&delayed_work->work) must not be used for
3937 * delayed_work's. Use cancel_delayed_work_sync() instead.
6e84d644 3938 *
401a8d04 3939 * The caller must ensure that the workqueue on which @work was last
6e84d644 3940 * queued can't be destroyed before this function returns.
401a8d04 3941 *
d185af30 3942 * Return:
401a8d04 3943 * %true if @work was pending, %false otherwise.
6e84d644 3944 */
401a8d04 3945bool cancel_work_sync(struct work_struct *work)
6e84d644 3946{
36e227d2 3947 return __cancel_work_timer(work, false);
b89deed3 3948}
28e53bdd 3949EXPORT_SYMBOL_GPL(cancel_work_sync);
b89deed3 3950
6e84d644 3951/**
401a8d04
TH
3952 * flush_delayed_work - wait for a dwork to finish executing the last queueing
3953 * @dwork: the delayed work to flush
6e84d644 3954 *
401a8d04
TH
3955 * Delayed timer is cancelled and the pending work is queued for
3956 * immediate execution. Like flush_work(), this function only
3957 * considers the last queueing instance of @dwork.
1f1f642e 3958 *
d185af30 3959 * Return:
401a8d04
TH
3960 * %true if flush_work() waited for the work to finish execution,
3961 * %false if it was already idle.
6e84d644 3962 */
401a8d04
TH
3963bool flush_delayed_work(struct delayed_work *dwork)
3964{
8930caba 3965 local_irq_disable();
401a8d04 3966 if (del_timer_sync(&dwork->timer))
60c057bc 3967 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
8930caba 3968 local_irq_enable();
401a8d04
TH
3969 return flush_work(&dwork->work);
3970}
3971EXPORT_SYMBOL(flush_delayed_work);
3972
05f0fe6b
TH
3973/**
3974 * flush_rcu_work - wait for a rwork to finish executing the last queueing
3975 * @rwork: the rcu work to flush
3976 *
3977 * Return:
3978 * %true if flush_rcu_work() waited for the work to finish execution,
3979 * %false if it was already idle.
3980 */
3981bool flush_rcu_work(struct rcu_work *rwork)
3982{
3983 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {
3984 rcu_barrier();
3985 flush_work(&rwork->work);
3986 return true;
3987 } else {
3988 return flush_work(&rwork->work);
3989 }
3990}
3991EXPORT_SYMBOL(flush_rcu_work);
3992
f72b8792
JA
3993static bool __cancel_work(struct work_struct *work, bool is_dwork)
3994{
3995 unsigned long flags;
3996 int ret;
3997
3998 do {
3999 ret = try_to_grab_pending(work, is_dwork, &flags);
4000 } while (unlikely(ret == -EAGAIN));
4001
4002 if (unlikely(ret < 0))
4003 return false;
4004
4005 set_work_pool_and_clear_pending(work, get_work_pool_id(work));
4006 local_irq_restore(flags);
4007 return ret;
4008}
4009
73b4b532
AG
4010/*
4011 * See cancel_delayed_work()
4012 */
4013bool cancel_work(struct work_struct *work)
4014{
4015 return __cancel_work(work, false);
4016}
4017EXPORT_SYMBOL(cancel_work);
4018
09383498 4019/**
57b30ae7
TH
4020 * cancel_delayed_work - cancel a delayed work
4021 * @dwork: delayed_work to cancel
09383498 4022 *
d185af30
YB
4023 * Kill off a pending delayed_work.
4024 *
4025 * Return: %true if @dwork was pending and canceled; %false if it wasn't
4026 * pending.
4027 *
4028 * Note:
4029 * The work callback function may still be running on return, unless
4030 * it returns %true and the work doesn't re-arm itself. Explicitly flush or
4031 * use cancel_delayed_work_sync() to wait on it.
09383498 4032 *
57b30ae7 4033 * This function is safe to call from any context including IRQ handler.
09383498 4034 */
57b30ae7 4035bool cancel_delayed_work(struct delayed_work *dwork)
09383498 4036{
f72b8792 4037 return __cancel_work(&dwork->work, true);
09383498 4038}
57b30ae7 4039EXPORT_SYMBOL(cancel_delayed_work);
09383498 4040
401a8d04
TH
4041/**
4042 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
4043 * @dwork: the delayed work cancel
4044 *
4045 * This is cancel_work_sync() for delayed works.
4046 *
d185af30 4047 * Return:
401a8d04
TH
4048 * %true if @dwork was pending, %false otherwise.
4049 */
4050bool cancel_delayed_work_sync(struct delayed_work *dwork)
6e84d644 4051{
36e227d2 4052 return __cancel_work_timer(&dwork->work, true);
6e84d644 4053}
f5a421a4 4054EXPORT_SYMBOL(cancel_delayed_work_sync);
1da177e4 4055
b6136773 4056/**
31ddd871 4057 * schedule_on_each_cpu - execute a function synchronously on each online CPU
b6136773 4058 * @func: the function to call
b6136773 4059 *
31ddd871
TH
4060 * schedule_on_each_cpu() executes @func on each online CPU using the
4061 * system workqueue and blocks until all CPUs have completed.
b6136773 4062 * schedule_on_each_cpu() is very slow.
31ddd871 4063 *
d185af30 4064 * Return:
31ddd871 4065 * 0 on success, -errno on failure.
b6136773 4066 */
65f27f38 4067int schedule_on_each_cpu(work_func_t func)
15316ba8
CL
4068{
4069 int cpu;
38f51568 4070 struct work_struct __percpu *works;
15316ba8 4071
b6136773
AM
4072 works = alloc_percpu(struct work_struct);
4073 if (!works)
15316ba8 4074 return -ENOMEM;
b6136773 4075
ffd8bea8 4076 cpus_read_lock();
93981800 4077
15316ba8 4078 for_each_online_cpu(cpu) {
9bfb1839
IM
4079 struct work_struct *work = per_cpu_ptr(works, cpu);
4080
4081 INIT_WORK(work, func);
b71ab8c2 4082 schedule_work_on(cpu, work);
65a64464 4083 }
93981800
TH
4084
4085 for_each_online_cpu(cpu)
4086 flush_work(per_cpu_ptr(works, cpu));
4087
ffd8bea8 4088 cpus_read_unlock();
b6136773 4089 free_percpu(works);
15316ba8
CL
4090 return 0;
4091}
4092
1fa44eca
JB
4093/**
4094 * execute_in_process_context - reliably execute the routine with user context
4095 * @fn: the function to execute
1fa44eca
JB
4096 * @ew: guaranteed storage for the execute work structure (must
4097 * be available when the work executes)
4098 *
4099 * Executes the function immediately if process context is available,
4100 * otherwise schedules the function for delayed execution.
4101 *
d185af30 4102 * Return: 0 - function was executed
1fa44eca
JB
4103 * 1 - function was scheduled for execution
4104 */
65f27f38 4105int execute_in_process_context(work_func_t fn, struct execute_work *ew)
1fa44eca
JB
4106{
4107 if (!in_interrupt()) {
65f27f38 4108 fn(&ew->work);
1fa44eca
JB
4109 return 0;
4110 }
4111
65f27f38 4112 INIT_WORK(&ew->work, fn);
1fa44eca
JB
4113 schedule_work(&ew->work);
4114
4115 return 1;
4116}
4117EXPORT_SYMBOL_GPL(execute_in_process_context);
4118
6ba94429
FW
4119/**
4120 * free_workqueue_attrs - free a workqueue_attrs
4121 * @attrs: workqueue_attrs to free
226223ab 4122 *
6ba94429 4123 * Undo alloc_workqueue_attrs().
226223ab 4124 */
513c98d0 4125void free_workqueue_attrs(struct workqueue_attrs *attrs)
226223ab 4126{
6ba94429
FW
4127 if (attrs) {
4128 free_cpumask_var(attrs->cpumask);
9546b29e 4129 free_cpumask_var(attrs->__pod_cpumask);
6ba94429
FW
4130 kfree(attrs);
4131 }
226223ab
TH
4132}
4133
6ba94429
FW
4134/**
4135 * alloc_workqueue_attrs - allocate a workqueue_attrs
6ba94429
FW
4136 *
4137 * Allocate a new workqueue_attrs, initialize with default settings and
4138 * return it.
4139 *
4140 * Return: The allocated new workqueue_attr on success. %NULL on failure.
4141 */
513c98d0 4142struct workqueue_attrs *alloc_workqueue_attrs(void)
226223ab 4143{
6ba94429 4144 struct workqueue_attrs *attrs;
226223ab 4145
be69d00d 4146 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
6ba94429
FW
4147 if (!attrs)
4148 goto fail;
be69d00d 4149 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
6ba94429 4150 goto fail;
9546b29e
TH
4151 if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))
4152 goto fail;
6ba94429
FW
4153
4154 cpumask_copy(attrs->cpumask, cpu_possible_mask);
523a301e 4155 attrs->affn_scope = WQ_AFFN_DFL;
6ba94429
FW
4156 return attrs;
4157fail:
4158 free_workqueue_attrs(attrs);
4159 return NULL;
226223ab
TH
4160}
4161
6ba94429
FW
4162static void copy_workqueue_attrs(struct workqueue_attrs *to,
4163 const struct workqueue_attrs *from)
226223ab 4164{
6ba94429
FW
4165 to->nice = from->nice;
4166 cpumask_copy(to->cpumask, from->cpumask);
9546b29e 4167 cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
8639eceb 4168 to->affn_strict = from->affn_strict;
84193c07 4169
6ba94429 4170 /*
84193c07
TH
4171 * Unlike hash and equality test, copying shouldn't ignore wq-only
4172 * fields as copying is used for both pool and wq attrs. Instead,
4173 * get_unbound_pool() explicitly clears the fields.
6ba94429 4174 */
84193c07 4175 to->affn_scope = from->affn_scope;
af73f5c9 4176 to->ordered = from->ordered;
226223ab
TH
4177}
4178
5de7a03c
TH
4179/*
4180 * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
4181 * comments in 'struct workqueue_attrs' definition.
4182 */
4183static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
4184{
84193c07 4185 attrs->affn_scope = WQ_AFFN_NR_TYPES;
5de7a03c
TH
4186 attrs->ordered = false;
4187}
4188
6ba94429
FW
4189/* hash value of the content of @attr */
4190static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
226223ab 4191{
6ba94429 4192 u32 hash = 0;
226223ab 4193
6ba94429
FW
4194 hash = jhash_1word(attrs->nice, hash);
4195 hash = jhash(cpumask_bits(attrs->cpumask),
4196 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
9546b29e
TH
4197 hash = jhash(cpumask_bits(attrs->__pod_cpumask),
4198 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
8639eceb 4199 hash = jhash_1word(attrs->affn_strict, hash);
6ba94429 4200 return hash;
226223ab 4201}
226223ab 4202
6ba94429
FW
4203/* content equality test */
4204static bool wqattrs_equal(const struct workqueue_attrs *a,
4205 const struct workqueue_attrs *b)
226223ab 4206{
6ba94429
FW
4207 if (a->nice != b->nice)
4208 return false;
4209 if (!cpumask_equal(a->cpumask, b->cpumask))
4210 return false;
9546b29e
TH
4211 if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
4212 return false;
8639eceb
TH
4213 if (a->affn_strict != b->affn_strict)
4214 return false;
6ba94429 4215 return true;
226223ab
TH
4216}
4217
0f36ee24
TH
4218/* Update @attrs with actually available CPUs */
4219static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
4220 const cpumask_t *unbound_cpumask)
4221{
4222 /*
4223 * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If
4224 * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to
4225 * @unbound_cpumask.
4226 */
4227 cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);
4228 if (unlikely(cpumask_empty(attrs->cpumask)))
4229 cpumask_copy(attrs->cpumask, unbound_cpumask);
4230}
4231
84193c07
TH
4232/* find wq_pod_type to use for @attrs */
4233static const struct wq_pod_type *
4234wqattrs_pod_type(const struct workqueue_attrs *attrs)
4235{
523a301e
TH
4236 enum wq_affn_scope scope;
4237 struct wq_pod_type *pt;
4238
4239 /* to synchronize access to wq_affn_dfl */
4240 lockdep_assert_held(&wq_pool_mutex);
4241
4242 if (attrs->affn_scope == WQ_AFFN_DFL)
4243 scope = wq_affn_dfl;
4244 else
4245 scope = attrs->affn_scope;
4246
4247 pt = &wq_pod_types[scope];
84193c07
TH
4248
4249 if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
4250 likely(pt->nr_pods))
4251 return pt;
4252
4253 /*
4254 * Before workqueue_init_topology(), only SYSTEM is available which is
4255 * initialized in workqueue_init_early().
4256 */
4257 pt = &wq_pod_types[WQ_AFFN_SYSTEM];
4258 BUG_ON(!pt->nr_pods);
4259 return pt;
4260}
4261
6ba94429
FW
4262/**
4263 * init_worker_pool - initialize a newly zalloc'd worker_pool
4264 * @pool: worker_pool to initialize
4265 *
402dd89d 4266 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
6ba94429
FW
4267 *
4268 * Return: 0 on success, -errno on failure. Even on failure, all fields
4269 * inside @pool proper are initialized and put_unbound_pool() can be called
4270 * on @pool safely to release it.
4271 */
4272static int init_worker_pool(struct worker_pool *pool)
226223ab 4273{
a9b8a985 4274 raw_spin_lock_init(&pool->lock);
6ba94429
FW
4275 pool->id = -1;
4276 pool->cpu = -1;
4277 pool->node = NUMA_NO_NODE;
4278 pool->flags |= POOL_DISASSOCIATED;
82607adc 4279 pool->watchdog_ts = jiffies;
6ba94429
FW
4280 INIT_LIST_HEAD(&pool->worklist);
4281 INIT_LIST_HEAD(&pool->idle_list);
4282 hash_init(pool->busy_hash);
226223ab 4283
32a6c723 4284 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
3f959aa3 4285 INIT_WORK(&pool->idle_cull_work, idle_cull_fn);
226223ab 4286
32a6c723 4287 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
226223ab 4288
6ba94429 4289 INIT_LIST_HEAD(&pool->workers);
e02b9312 4290 INIT_LIST_HEAD(&pool->dying_workers);
226223ab 4291
6ba94429
FW
4292 ida_init(&pool->worker_ida);
4293 INIT_HLIST_NODE(&pool->hash_node);
4294 pool->refcnt = 1;
226223ab 4295
6ba94429 4296 /* shouldn't fail above this point */
be69d00d 4297 pool->attrs = alloc_workqueue_attrs();
6ba94429
FW
4298 if (!pool->attrs)
4299 return -ENOMEM;
5de7a03c
TH
4300
4301 wqattrs_clear_for_pool(pool->attrs);
4302
6ba94429 4303 return 0;
226223ab
TH
4304}
4305
669de8bd
BVA
4306#ifdef CONFIG_LOCKDEP
4307static void wq_init_lockdep(struct workqueue_struct *wq)
4308{
4309 char *lock_name;
4310
4311 lockdep_register_key(&wq->key);
4312 lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
4313 if (!lock_name)
4314 lock_name = wq->name;
69a106c0
QC
4315
4316 wq->lock_name = lock_name;
669de8bd
BVA
4317 lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
4318}
4319
4320static void wq_unregister_lockdep(struct workqueue_struct *wq)
4321{
4322 lockdep_unregister_key(&wq->key);
4323}
4324
4325static void wq_free_lockdep(struct workqueue_struct *wq)
4326{
4327 if (wq->lock_name != wq->name)
4328 kfree(wq->lock_name);
4329}
4330#else
4331static void wq_init_lockdep(struct workqueue_struct *wq)
4332{
4333}
4334
4335static void wq_unregister_lockdep(struct workqueue_struct *wq)
4336{
4337}
4338
4339static void wq_free_lockdep(struct workqueue_struct *wq)
4340{
4341}
4342#endif
4343
91ccc6e7
TH
4344static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
4345{
4346 int node;
4347
4348 for_each_node(node) {
4349 kfree(nna_ar[node]);
4350 nna_ar[node] = NULL;
4351 }
4352
4353 kfree(nna_ar[nr_node_ids]);
4354 nna_ar[nr_node_ids] = NULL;
4355}
4356
4357static void init_node_nr_active(struct wq_node_nr_active *nna)
4358{
4359 atomic_set(&nna->nr, 0);
5797b1c1
TH
4360 raw_spin_lock_init(&nna->lock);
4361 INIT_LIST_HEAD(&nna->pending_pwqs);
91ccc6e7
TH
4362}
4363
4364/*
4365 * Each node's nr_active counter will be accessed mostly from its own node and
4366 * should be allocated in the node.
4367 */
4368static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
4369{
4370 struct wq_node_nr_active *nna;
4371 int node;
4372
4373 for_each_node(node) {
4374 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
4375 if (!nna)
4376 goto err_free;
4377 init_node_nr_active(nna);
4378 nna_ar[node] = nna;
4379 }
4380
4381 /* [nr_node_ids] is used as the fallback */
4382 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
4383 if (!nna)
4384 goto err_free;
4385 init_node_nr_active(nna);
4386 nna_ar[nr_node_ids] = nna;
4387
4388 return 0;
4389
4390err_free:
4391 free_node_nr_active(nna_ar);
4392 return -ENOMEM;
4393}
4394
6ba94429 4395static void rcu_free_wq(struct rcu_head *rcu)
226223ab 4396{
6ba94429
FW
4397 struct workqueue_struct *wq =
4398 container_of(rcu, struct workqueue_struct, rcu);
226223ab 4399
91ccc6e7
TH
4400 if (wq->flags & WQ_UNBOUND)
4401 free_node_nr_active(wq->node_nr_active);
4402
669de8bd 4403 wq_free_lockdep(wq);
636b927e
TH
4404 free_percpu(wq->cpu_pwq);
4405 free_workqueue_attrs(wq->unbound_attrs);
6ba94429 4406 kfree(wq);
226223ab
TH
4407}
4408
6ba94429 4409static void rcu_free_pool(struct rcu_head *rcu)
226223ab 4410{
6ba94429 4411 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
226223ab 4412
6ba94429
FW
4413 ida_destroy(&pool->worker_ida);
4414 free_workqueue_attrs(pool->attrs);
4415 kfree(pool);
226223ab
TH
4416}
4417
6ba94429
FW
4418/**
4419 * put_unbound_pool - put a worker_pool
4420 * @pool: worker_pool to put
4421 *
24acfb71 4422 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
6ba94429
FW
4423 * safe manner. get_unbound_pool() calls this function on its failure path
4424 * and this function should be able to release pools which went through,
4425 * successfully or not, init_worker_pool().
4426 *
4427 * Should be called with wq_pool_mutex held.
4428 */
4429static void put_unbound_pool(struct worker_pool *pool)
226223ab 4430{
6ba94429
FW
4431 DECLARE_COMPLETION_ONSTACK(detach_completion);
4432 struct worker *worker;
9680540c 4433 LIST_HEAD(cull_list);
e02b9312 4434
6ba94429 4435 lockdep_assert_held(&wq_pool_mutex);
226223ab 4436
6ba94429
FW
4437 if (--pool->refcnt)
4438 return;
226223ab 4439
6ba94429
FW
4440 /* sanity checks */
4441 if (WARN_ON(!(pool->cpu < 0)) ||
4442 WARN_ON(!list_empty(&pool->worklist)))
4443 return;
226223ab 4444
6ba94429
FW
4445 /* release id and unhash */
4446 if (pool->id >= 0)
4447 idr_remove(&worker_pool_idr, pool->id);
4448 hash_del(&pool->hash_node);
d55262c4 4449
6ba94429 4450 /*
692b4825
TH
4451 * Become the manager and destroy all workers. This prevents
4452 * @pool's workers from blocking on attach_mutex. We're the last
4453 * manager and @pool gets freed with the flag set.
9ab03be4
VS
4454 *
4455 * Having a concurrent manager is quite unlikely to happen as we can
4456 * only get here with
4457 * pwq->refcnt == pool->refcnt == 0
4458 * which implies no work queued to the pool, which implies no worker can
4459 * become the manager. However a worker could have taken the role of
4460 * manager before the refcnts dropped to 0, since maybe_create_worker()
4461 * drops pool->lock
6ba94429 4462 */
9ab03be4
VS
4463 while (true) {
4464 rcuwait_wait_event(&manager_wait,
4465 !(pool->flags & POOL_MANAGER_ACTIVE),
4466 TASK_UNINTERRUPTIBLE);
e02b9312
VS
4467
4468 mutex_lock(&wq_pool_attach_mutex);
9ab03be4
VS
4469 raw_spin_lock_irq(&pool->lock);
4470 if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
4471 pool->flags |= POOL_MANAGER_ACTIVE;
4472 break;
4473 }
4474 raw_spin_unlock_irq(&pool->lock);
e02b9312 4475 mutex_unlock(&wq_pool_attach_mutex);
9ab03be4 4476 }
692b4825 4477
6ba94429 4478 while ((worker = first_idle_worker(pool)))
e02b9312 4479 set_worker_dying(worker, &cull_list);
6ba94429 4480 WARN_ON(pool->nr_workers || pool->nr_idle);
a9b8a985 4481 raw_spin_unlock_irq(&pool->lock);
d55262c4 4482
e02b9312
VS
4483 wake_dying_workers(&cull_list);
4484
4485 if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
6ba94429 4486 pool->detach_completion = &detach_completion;
1258fae7 4487 mutex_unlock(&wq_pool_attach_mutex);
226223ab 4488
6ba94429
FW
4489 if (pool->detach_completion)
4490 wait_for_completion(pool->detach_completion);
226223ab 4491
6ba94429
FW
4492 /* shut down the timers */
4493 del_timer_sync(&pool->idle_timer);
3f959aa3 4494 cancel_work_sync(&pool->idle_cull_work);
6ba94429 4495 del_timer_sync(&pool->mayday_timer);
226223ab 4496
24acfb71 4497 /* RCU protected to allow dereferences from get_work_pool() */
25b00775 4498 call_rcu(&pool->rcu, rcu_free_pool);
226223ab
TH
4499}
4500
4501/**
6ba94429
FW
4502 * get_unbound_pool - get a worker_pool with the specified attributes
4503 * @attrs: the attributes of the worker_pool to get
226223ab 4504 *
6ba94429
FW
4505 * Obtain a worker_pool which has the same attributes as @attrs, bump the
4506 * reference count and return it. If there already is a matching
4507 * worker_pool, it will be used; otherwise, this function attempts to
4508 * create a new one.
226223ab 4509 *
6ba94429 4510 * Should be called with wq_pool_mutex held.
226223ab 4511 *
6ba94429
FW
4512 * Return: On success, a worker_pool with the same attributes as @attrs.
4513 * On failure, %NULL.
226223ab 4514 */
6ba94429 4515static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
226223ab 4516{
84193c07 4517 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];
6ba94429
FW
4518 u32 hash = wqattrs_hash(attrs);
4519 struct worker_pool *pool;
84193c07 4520 int pod, node = NUMA_NO_NODE;
226223ab 4521
6ba94429 4522 lockdep_assert_held(&wq_pool_mutex);
226223ab 4523
6ba94429
FW
4524 /* do we already have a matching pool? */
4525 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
4526 if (wqattrs_equal(pool->attrs, attrs)) {
4527 pool->refcnt++;
4528 return pool;
4529 }
4530 }
226223ab 4531
9546b29e 4532 /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
84193c07 4533 for (pod = 0; pod < pt->nr_pods; pod++) {
9546b29e 4534 if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {
84193c07
TH
4535 node = pt->pod_node[pod];
4536 break;
e2273584
XP
4537 }
4538 }
4539
6ba94429 4540 /* nope, create a new one */
84193c07 4541 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);
6ba94429
FW
4542 if (!pool || init_worker_pool(pool) < 0)
4543 goto fail;
4544
84193c07 4545 pool->node = node;
5de7a03c
TH
4546 copy_workqueue_attrs(pool->attrs, attrs);
4547 wqattrs_clear_for_pool(pool->attrs);
226223ab 4548
6ba94429
FW
4549 if (worker_pool_assign_id(pool) < 0)
4550 goto fail;
226223ab 4551
6ba94429 4552 /* create and start the initial worker */
3347fa09 4553 if (wq_online && !create_worker(pool))
6ba94429 4554 goto fail;
226223ab 4555
6ba94429
FW
4556 /* install */
4557 hash_add(unbound_pool_hash, &pool->hash_node, hash);
226223ab 4558
6ba94429
FW
4559 return pool;
4560fail:
4561 if (pool)
4562 put_unbound_pool(pool);
4563 return NULL;
226223ab 4564}
226223ab 4565
6ba94429 4566static void rcu_free_pwq(struct rcu_head *rcu)
7a4e344c 4567{
6ba94429
FW
4568 kmem_cache_free(pwq_cache,
4569 container_of(rcu, struct pool_workqueue, rcu));
7a4e344c
TH
4570}
4571
6ba94429 4572/*
967b494e
TH
4573 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
4574 * refcnt and needs to be destroyed.
7a4e344c 4575 */
687a9aa5 4576static void pwq_release_workfn(struct kthread_work *work)
7a4e344c 4577{
6ba94429 4578 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
687a9aa5 4579 release_work);
6ba94429
FW
4580 struct workqueue_struct *wq = pwq->wq;
4581 struct worker_pool *pool = pwq->pool;
b42b0bdd 4582 bool is_last = false;
7a4e344c 4583
b42b0bdd 4584 /*
687a9aa5 4585 * When @pwq is not linked, it doesn't hold any reference to the
b42b0bdd
YY
4586 * @wq, and @wq is invalid to access.
4587 */
4588 if (!list_empty(&pwq->pwqs_node)) {
b42b0bdd
YY
4589 mutex_lock(&wq->mutex);
4590 list_del_rcu(&pwq->pwqs_node);
4591 is_last = list_empty(&wq->pwqs);
4592 mutex_unlock(&wq->mutex);
4593 }
6ba94429 4594
687a9aa5
TH
4595 if (wq->flags & WQ_UNBOUND) {
4596 mutex_lock(&wq_pool_mutex);
4597 put_unbound_pool(pool);
4598 mutex_unlock(&wq_pool_mutex);
4599 }
6ba94429 4600
5797b1c1
TH
4601 if (!list_empty(&pwq->pending_node)) {
4602 struct wq_node_nr_active *nna =
4603 wq_node_nr_active(pwq->wq, pwq->pool->node);
4604
4605 raw_spin_lock_irq(&nna->lock);
4606 list_del_init(&pwq->pending_node);
4607 raw_spin_unlock_irq(&nna->lock);
4608 }
4609
25b00775 4610 call_rcu(&pwq->rcu, rcu_free_pwq);
7a4e344c 4611
2865a8fb 4612 /*
6ba94429
FW
4613 * If we're the last pwq going away, @wq is already dead and no one
4614 * is gonna access it anymore. Schedule RCU free.
2865a8fb 4615 */
669de8bd
BVA
4616 if (is_last) {
4617 wq_unregister_lockdep(wq);
25b00775 4618 call_rcu(&wq->rcu, rcu_free_wq);
669de8bd 4619 }
29c91e99
TH
4620}
4621
67dc8325 4622/* initialize newly allocated @pwq which is associated with @wq and @pool */
6ba94429
FW
4623static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
4624 struct worker_pool *pool)
29c91e99 4625{
6ba94429 4626 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
29c91e99 4627
6ba94429
FW
4628 memset(pwq, 0, sizeof(*pwq));
4629
4630 pwq->pool = pool;
4631 pwq->wq = wq;
4632 pwq->flush_color = -1;
4633 pwq->refcnt = 1;
f97a4a1a 4634 INIT_LIST_HEAD(&pwq->inactive_works);
5797b1c1 4635 INIT_LIST_HEAD(&pwq->pending_node);
6ba94429
FW
4636 INIT_LIST_HEAD(&pwq->pwqs_node);
4637 INIT_LIST_HEAD(&pwq->mayday_node);
687a9aa5 4638 kthread_init_work(&pwq->release_work, pwq_release_workfn);
29c91e99
TH
4639}
4640
6ba94429
FW
4641/* sync @pwq with the current state of its associated wq and link it */
4642static void link_pwq(struct pool_workqueue *pwq)
29c91e99 4643{
6ba94429 4644 struct workqueue_struct *wq = pwq->wq;
29c91e99 4645
6ba94429 4646 lockdep_assert_held(&wq->mutex);
a892cacc 4647
6ba94429
FW
4648 /* may be called multiple times, ignore if already linked */
4649 if (!list_empty(&pwq->pwqs_node))
29c91e99 4650 return;
29c91e99 4651
6ba94429
FW
4652 /* set the matching work_color */
4653 pwq->work_color = wq->work_color;
29c91e99 4654
6ba94429
FW
4655 /* link in @pwq */
4656 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
4657}
29c91e99 4658
6ba94429
FW
4659/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
4660static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
4661 const struct workqueue_attrs *attrs)
4662{
4663 struct worker_pool *pool;
4664 struct pool_workqueue *pwq;
60f5a4bc 4665
6ba94429 4666 lockdep_assert_held(&wq_pool_mutex);
60f5a4bc 4667
6ba94429
FW
4668 pool = get_unbound_pool(attrs);
4669 if (!pool)
4670 return NULL;
60f5a4bc 4671
6ba94429
FW
4672 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
4673 if (!pwq) {
4674 put_unbound_pool(pool);
4675 return NULL;
4676 }
29c91e99 4677
6ba94429
FW
4678 init_pwq(pwq, wq, pool);
4679 return pwq;
4680}
29c91e99 4681
29c91e99 4682/**
fef59c9c 4683 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
042f7df1 4684 * @attrs: the wq_attrs of the default pwq of the target workqueue
84193c07 4685 * @cpu: the target CPU
6ba94429 4686 * @cpu_going_down: if >= 0, the CPU to consider as offline
29c91e99 4687 *
fef59c9c
TH
4688 * Calculate the cpumask a workqueue with @attrs should use on @pod. If
4689 * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
9546b29e 4690 * The result is stored in @attrs->__pod_cpumask.
a892cacc 4691 *
fef59c9c
TH
4692 * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
4693 * and @pod has online CPUs requested by @attrs, the returned cpumask is the
4694 * intersection of the possible CPUs of @pod and @attrs->cpumask.
d185af30 4695 *
fef59c9c 4696 * The caller is responsible for ensuring that the cpumask of @pod stays stable.
29c91e99 4697 */
9546b29e
TH
4698static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
4699 int cpu_going_down)
29c91e99 4700{
84193c07
TH
4701 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
4702 int pod = pt->cpu_pod[cpu];
29c91e99 4703
fef59c9c 4704 /* does @pod have any online CPUs @attrs wants? */
9546b29e
TH
4705 cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
4706 cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
6ba94429 4707 if (cpu_going_down >= 0)
9546b29e 4708 cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
29c91e99 4709
9546b29e
TH
4710 if (cpumask_empty(attrs->__pod_cpumask)) {
4711 cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
84193c07
TH
4712 return;
4713 }
4c16bd32 4714
fef59c9c 4715 /* yeap, return possible CPUs in @pod that @attrs wants */
9546b29e 4716 cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
1ad0f0a7 4717
9546b29e 4718 if (cpumask_empty(attrs->__pod_cpumask))
1ad0f0a7
MB
4719 pr_warn_once("WARNING: workqueue cpumask: online intersect > "
4720 "possible intersect\n");
4c16bd32
TH
4721}
4722
9f66cff2 4723/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
636b927e
TH
4724static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
4725 int cpu, struct pool_workqueue *pwq)
1befcf30 4726{
9f66cff2 4727 struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
1befcf30
TH
4728 struct pool_workqueue *old_pwq;
4729
5b95e1af 4730 lockdep_assert_held(&wq_pool_mutex);
1befcf30
TH
4731 lockdep_assert_held(&wq->mutex);
4732
4733 /* link_pwq() can handle duplicate calls */
4734 link_pwq(pwq);
4735
9f66cff2
TH
4736 old_pwq = rcu_access_pointer(*slot);
4737 rcu_assign_pointer(*slot, pwq);
1befcf30
TH
4738 return old_pwq;
4739}
4740
2d5f0764
LJ
4741/* context to store the prepared attrs & pwqs before applying */
4742struct apply_wqattrs_ctx {
4743 struct workqueue_struct *wq; /* target workqueue */
4744 struct workqueue_attrs *attrs; /* attrs to apply */
042f7df1 4745 struct list_head list; /* queued for batching commit */
2d5f0764
LJ
4746 struct pool_workqueue *dfl_pwq;
4747 struct pool_workqueue *pwq_tbl[];
4748};
4749
4750/* free the resources after success or abort */
4751static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
4752{
4753 if (ctx) {
636b927e 4754 int cpu;
2d5f0764 4755
636b927e
TH
4756 for_each_possible_cpu(cpu)
4757 put_pwq_unlocked(ctx->pwq_tbl[cpu]);
2d5f0764
LJ
4758 put_pwq_unlocked(ctx->dfl_pwq);
4759
4760 free_workqueue_attrs(ctx->attrs);
4761
4762 kfree(ctx);
4763 }
4764}
4765
4766/* allocate the attrs and pwqs for later installation */
4767static struct apply_wqattrs_ctx *
4768apply_wqattrs_prepare(struct workqueue_struct *wq,
99c621ef
LJ
4769 const struct workqueue_attrs *attrs,
4770 const cpumask_var_t unbound_cpumask)
9e8cd2f5 4771{
2d5f0764 4772 struct apply_wqattrs_ctx *ctx;
9546b29e 4773 struct workqueue_attrs *new_attrs;
636b927e 4774 int cpu;
9e8cd2f5 4775
2d5f0764 4776 lockdep_assert_held(&wq_pool_mutex);
9e8cd2f5 4777
84193c07
TH
4778 if (WARN_ON(attrs->affn_scope < 0 ||
4779 attrs->affn_scope >= WQ_AFFN_NR_TYPES))
4780 return ERR_PTR(-EINVAL);
4781
636b927e 4782 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);
8719dcea 4783
be69d00d 4784 new_attrs = alloc_workqueue_attrs();
9546b29e 4785 if (!ctx || !new_attrs)
2d5f0764 4786 goto out_free;
13e2e556 4787
4c16bd32
TH
4788 /*
4789 * If something goes wrong during CPU up/down, we'll fall back to
4790 * the default pwq covering whole @attrs->cpumask. Always create
4791 * it even if we don't use it immediately.
4792 */
0f36ee24
TH
4793 copy_workqueue_attrs(new_attrs, attrs);
4794 wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
9546b29e 4795 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
2d5f0764
LJ
4796 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
4797 if (!ctx->dfl_pwq)
4798 goto out_free;
4c16bd32 4799
636b927e 4800 for_each_possible_cpu(cpu) {
af73f5c9 4801 if (new_attrs->ordered) {
2d5f0764 4802 ctx->dfl_pwq->refcnt++;
636b927e
TH
4803 ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
4804 } else {
9546b29e
TH
4805 wq_calc_pod_cpumask(new_attrs, cpu, -1);
4806 ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
636b927e
TH
4807 if (!ctx->pwq_tbl[cpu])
4808 goto out_free;
4c16bd32
TH
4809 }
4810 }
4811
042f7df1
LJ
4812 /* save the user configured attrs and sanitize it. */
4813 copy_workqueue_attrs(new_attrs, attrs);
4814 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
9546b29e 4815 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
2d5f0764 4816 ctx->attrs = new_attrs;
042f7df1 4817
2d5f0764 4818 ctx->wq = wq;
2d5f0764
LJ
4819 return ctx;
4820
4821out_free:
2d5f0764
LJ
4822 free_workqueue_attrs(new_attrs);
4823 apply_wqattrs_cleanup(ctx);
84193c07 4824 return ERR_PTR(-ENOMEM);
2d5f0764
LJ
4825}
4826
4827/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
4828static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
4829{
636b927e 4830 int cpu;
9e8cd2f5 4831
4c16bd32 4832 /* all pwqs have been created successfully, let's install'em */
2d5f0764 4833 mutex_lock(&ctx->wq->mutex);
a892cacc 4834
2d5f0764 4835 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
4c16bd32 4836
9f66cff2 4837 /* save the previous pwqs and install the new ones */
636b927e
TH
4838 for_each_possible_cpu(cpu)
4839 ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
4840 ctx->pwq_tbl[cpu]);
9f66cff2 4841 ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);
f147f29e 4842
5797b1c1
TH
4843 /* update node_nr_active->max */
4844 wq_update_node_max_active(ctx->wq, -1);
4845
2d5f0764
LJ
4846 mutex_unlock(&ctx->wq->mutex);
4847}
9e8cd2f5 4848
a0111cf6
LJ
4849static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
4850 const struct workqueue_attrs *attrs)
2d5f0764
LJ
4851{
4852 struct apply_wqattrs_ctx *ctx;
4c16bd32 4853
2d5f0764
LJ
4854 /* only unbound workqueues can change attributes */
4855 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
4856 return -EINVAL;
13e2e556 4857
2d5f0764 4858 /* creating multiple pwqs breaks ordering guarantee */
0a94efb5
TH
4859 if (!list_empty(&wq->pwqs)) {
4860 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
4861 return -EINVAL;
4862
4863 wq->flags &= ~__WQ_ORDERED;
4864 }
2d5f0764 4865
99c621ef 4866 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
84193c07
TH
4867 if (IS_ERR(ctx))
4868 return PTR_ERR(ctx);
2d5f0764
LJ
4869
4870 /* the ctx has been prepared successfully, let's commit it */
6201171e 4871 apply_wqattrs_commit(ctx);
2d5f0764
LJ
4872 apply_wqattrs_cleanup(ctx);
4873
6201171e 4874 return 0;
9e8cd2f5
TH
4875}
4876
a0111cf6
LJ
4877/**
4878 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
4879 * @wq: the target workqueue
4880 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
4881 *
fef59c9c
TH
4882 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
4883 * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that
4884 * work items are affine to the pod it was issued on. Older pwqs are released as
4885 * in-flight work items finish. Note that a work item which repeatedly requeues
4886 * itself back-to-back will stay on its current pwq.
a0111cf6
LJ
4887 *
4888 * Performs GFP_KERNEL allocations.
4889 *
ffd8bea8 4890 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
509b3204 4891 *
a0111cf6
LJ
4892 * Return: 0 on success and -errno on failure.
4893 */
513c98d0 4894int apply_workqueue_attrs(struct workqueue_struct *wq,
a0111cf6
LJ
4895 const struct workqueue_attrs *attrs)
4896{
4897 int ret;
4898
509b3204
DJ
4899 lockdep_assert_cpus_held();
4900
4901 mutex_lock(&wq_pool_mutex);
a0111cf6 4902 ret = apply_workqueue_attrs_locked(wq, attrs);
509b3204 4903 mutex_unlock(&wq_pool_mutex);
a0111cf6
LJ
4904
4905 return ret;
4906}
4907
4c16bd32 4908/**
fef59c9c 4909 * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
4c16bd32 4910 * @wq: the target workqueue
4cbfd3de
TH
4911 * @cpu: the CPU to update pool association for
4912 * @hotplug_cpu: the CPU coming up or going down
4c16bd32
TH
4913 * @online: whether @cpu is coming up or going down
4914 *
4915 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
fef59c9c 4916 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
4c16bd32
TH
4917 * @wq accordingly.
4918 *
fef59c9c
TH
4919 *
4920 * If pod affinity can't be adjusted due to memory allocation failure, it falls
4921 * back to @wq->dfl_pwq which may not be optimal but is always correct.
4922 *
4923 * Note that when the last allowed CPU of a pod goes offline for a workqueue
4924 * with a cpumask spanning multiple pods, the workers which were already
4925 * executing the work items for the workqueue will lose their CPU affinity and
4926 * may execute on any CPU. This is similar to how per-cpu workqueues behave on
4927 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
4928 * responsibility to flush the work item from CPU_DOWN_PREPARE.
4c16bd32 4929 */
fef59c9c
TH
4930static void wq_update_pod(struct workqueue_struct *wq, int cpu,
4931 int hotplug_cpu, bool online)
4c16bd32 4932{
4cbfd3de 4933 int off_cpu = online ? -1 : hotplug_cpu;
4c16bd32
TH
4934 struct pool_workqueue *old_pwq = NULL, *pwq;
4935 struct workqueue_attrs *target_attrs;
4c16bd32
TH
4936
4937 lockdep_assert_held(&wq_pool_mutex);
4938
84193c07 4939 if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)
4c16bd32
TH
4940 return;
4941
4942 /*
4943 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
4944 * Let's use a preallocated one. The following buf is protected by
4945 * CPU hotplug exclusion.
4946 */
fef59c9c 4947 target_attrs = wq_update_pod_attrs_buf;
4c16bd32 4948
4c16bd32 4949 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
0f36ee24 4950 wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
4c16bd32 4951
636b927e 4952 /* nothing to do if the target cpumask matches the current pwq */
9546b29e 4953 wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
9f66cff2 4954 if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
636b927e 4955 return;
4c16bd32 4956
4c16bd32
TH
4957 /* create a new pwq */
4958 pwq = alloc_unbound_pwq(wq, target_attrs);
4959 if (!pwq) {
fef59c9c 4960 pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
2d916033 4961 wq->name);
77f300b1 4962 goto use_dfl_pwq;
4c16bd32
TH
4963 }
4964
f7142ed4 4965 /* Install the new pwq. */
4c16bd32 4966 mutex_lock(&wq->mutex);
636b927e 4967 old_pwq = install_unbound_pwq(wq, cpu, pwq);
4c16bd32
TH
4968 goto out_unlock;
4969
4970use_dfl_pwq:
f7142ed4 4971 mutex_lock(&wq->mutex);
9f66cff2
TH
4972 pwq = unbound_pwq(wq, -1);
4973 raw_spin_lock_irq(&pwq->pool->lock);
4974 get_pwq(pwq);
4975 raw_spin_unlock_irq(&pwq->pool->lock);
4976 old_pwq = install_unbound_pwq(wq, cpu, pwq);
4c16bd32
TH
4977out_unlock:
4978 mutex_unlock(&wq->mutex);
4979 put_pwq_unlocked(old_pwq);
4980}
4981
30cdf249 4982static int alloc_and_link_pwqs(struct workqueue_struct *wq)
0f900049 4983{
49e3cf44 4984 bool highpri = wq->flags & WQ_HIGHPRI;
8a2b7538 4985 int cpu, ret;
30cdf249 4986
636b927e
TH
4987 wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
4988 if (!wq->cpu_pwq)
4989 goto enomem;
30cdf249 4990
636b927e 4991 if (!(wq->flags & WQ_UNBOUND)) {
30cdf249 4992 for_each_possible_cpu(cpu) {
687a9aa5 4993 struct pool_workqueue **pwq_p =
ee1ceef7 4994 per_cpu_ptr(wq->cpu_pwq, cpu);
687a9aa5
TH
4995 struct worker_pool *pool =
4996 &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);
4997
4998 *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
4999 pool->node);
5000 if (!*pwq_p)
5001 goto enomem;
f3421797 5002
687a9aa5 5003 init_pwq(*pwq_p, wq, pool);
f147f29e
TH
5004
5005 mutex_lock(&wq->mutex);
687a9aa5 5006 link_pwq(*pwq_p);
f147f29e 5007 mutex_unlock(&wq->mutex);
30cdf249 5008 }
9e8cd2f5 5009 return 0;
509b3204
DJ
5010 }
5011
ffd8bea8 5012 cpus_read_lock();
509b3204 5013 if (wq->flags & __WQ_ORDERED) {
9f66cff2
TH
5014 struct pool_workqueue *dfl_pwq;
5015
8a2b7538
TH
5016 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
5017 /* there should only be single pwq for ordering guarantee */
9f66cff2
TH
5018 dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
5019 WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
5020 wq->pwqs.prev != &dfl_pwq->pwqs_node),
8a2b7538 5021 "ordering guarantee broken for workqueue %s\n", wq->name);
30cdf249 5022 } else {
509b3204 5023 ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
30cdf249 5024 }
ffd8bea8 5025 cpus_read_unlock();
509b3204 5026
64344553
Z
5027 /* for unbound pwq, flush the pwq_release_worker ensures that the
5028 * pwq_release_workfn() completes before calling kfree(wq).
5029 */
5030 if (ret)
5031 kthread_flush_worker(pwq_release_worker);
5032
509b3204 5033 return ret;
687a9aa5
TH
5034
5035enomem:
5036 if (wq->cpu_pwq) {
7b42f401
Z
5037 for_each_possible_cpu(cpu) {
5038 struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
5039
5040 if (pwq)
5041 kmem_cache_free(pwq_cache, pwq);
5042 }
687a9aa5
TH
5043 free_percpu(wq->cpu_pwq);
5044 wq->cpu_pwq = NULL;
5045 }
5046 return -ENOMEM;
0f900049
TH
5047}
5048
f3421797
TH
5049static int wq_clamp_max_active(int max_active, unsigned int flags,
5050 const char *name)
b71ab8c2 5051{
636b927e 5052 if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
044c782c 5053 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
636b927e 5054 max_active, name, 1, WQ_MAX_ACTIVE);
b71ab8c2 5055
636b927e 5056 return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
b71ab8c2
TH
5057}
5058
983c7515
TH
5059/*
5060 * Workqueues which may be used during memory reclaim should have a rescuer
5061 * to guarantee forward progress.
5062 */
5063static int init_rescuer(struct workqueue_struct *wq)
5064{
5065 struct worker *rescuer;
b92b36ea 5066 int ret;
983c7515
TH
5067
5068 if (!(wq->flags & WQ_MEM_RECLAIM))
5069 return 0;
5070
5071 rescuer = alloc_worker(NUMA_NO_NODE);
4c0736a7
PM
5072 if (!rescuer) {
5073 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
5074 wq->name);
983c7515 5075 return -ENOMEM;
4c0736a7 5076 }
983c7515
TH
5077
5078 rescuer->rescue_wq = wq;
b6a46f72 5079 rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);
f187b697 5080 if (IS_ERR(rescuer->task)) {
b92b36ea 5081 ret = PTR_ERR(rescuer->task);
4c0736a7
PM
5082 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
5083 wq->name, ERR_PTR(ret));
983c7515 5084 kfree(rescuer);
b92b36ea 5085 return ret;
983c7515
TH
5086 }
5087
5088 wq->rescuer = rescuer;
85f0ab43
JL
5089 if (wq->flags & WQ_UNBOUND)
5090 kthread_bind_mask(rescuer->task, wq->unbound_attrs->cpumask);
5091 else
5092 kthread_bind_mask(rescuer->task, cpu_possible_mask);
983c7515
TH
5093 wake_up_process(rescuer->task);
5094
5095 return 0;
5096}
5097
a045a272
TH
5098/**
5099 * wq_adjust_max_active - update a wq's max_active to the current setting
5100 * @wq: target workqueue
5101 *
5102 * If @wq isn't freezing, set @wq->max_active to the saved_max_active and
5103 * activate inactive work items accordingly. If @wq is freezing, clear
5104 * @wq->max_active to zero.
5105 */
5106static void wq_adjust_max_active(struct workqueue_struct *wq)
5107{
c5404d4e 5108 bool activated;
5797b1c1 5109 int new_max, new_min;
a045a272
TH
5110
5111 lockdep_assert_held(&wq->mutex);
5112
5113 if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
5797b1c1
TH
5114 new_max = 0;
5115 new_min = 0;
5116 } else {
5117 new_max = wq->saved_max_active;
5118 new_min = wq->saved_min_active;
a045a272
TH
5119 }
5120
5797b1c1 5121 if (wq->max_active == new_max && wq->min_active == new_min)
a045a272
TH
5122 return;
5123
5124 /*
5797b1c1 5125 * Update @wq->max/min_active and then kick inactive work items if more
a045a272
TH
5126 * active work items are allowed. This doesn't break work item ordering
5127 * because new work items are always queued behind existing inactive
5128 * work items if there are any.
5129 */
5797b1c1
TH
5130 WRITE_ONCE(wq->max_active, new_max);
5131 WRITE_ONCE(wq->min_active, new_min);
5132
5133 if (wq->flags & WQ_UNBOUND)
5134 wq_update_node_max_active(wq, -1);
5135
5136 if (new_max == 0)
5137 return;
a045a272 5138
c5404d4e
TH
5139 /*
5140 * Round-robin through pwq's activating the first inactive work item
5141 * until max_active is filled.
5142 */
5143 do {
5144 struct pool_workqueue *pwq;
a045a272 5145
c5404d4e
TH
5146 activated = false;
5147 for_each_pwq(pwq, wq) {
5148 unsigned long flags;
a045a272 5149
c5404d4e
TH
5150 /* can be called during early boot w/ irq disabled */
5151 raw_spin_lock_irqsave(&pwq->pool->lock, flags);
5797b1c1 5152 if (pwq_activate_first_inactive(pwq, true)) {
c5404d4e
TH
5153 activated = true;
5154 kick_pool(pwq->pool);
5155 }
5156 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
5157 }
5158 } while (activated);
a045a272
TH
5159}
5160
a2775bbc 5161__printf(1, 4)
669de8bd
BVA
5162struct workqueue_struct *alloc_workqueue(const char *fmt,
5163 unsigned int flags,
5164 int max_active, ...)
1da177e4 5165{
ecf6881f 5166 va_list args;
1da177e4 5167 struct workqueue_struct *wq;
91ccc6e7
TH
5168 size_t wq_size;
5169 int name_len;
b196be89 5170
5c0338c6 5171 /*
fef59c9c
TH
5172 * Unbound && max_active == 1 used to imply ordered, which is no longer
5173 * the case on many machines due to per-pod pools. While
5c0338c6 5174 * alloc_ordered_workqueue() is the right way to create an ordered
fef59c9c 5175 * workqueue, keep the previous behavior to avoid subtle breakages.
5c0338c6
TH
5176 */
5177 if ((flags & WQ_UNBOUND) && max_active == 1)
5178 flags |= __WQ_ORDERED;
5179
cee22a15
VK
5180 /* see the comment above the definition of WQ_POWER_EFFICIENT */
5181 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
5182 flags |= WQ_UNBOUND;
5183
ecf6881f 5184 /* allocate wq and format name */
91ccc6e7
TH
5185 if (flags & WQ_UNBOUND)
5186 wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
5187 else
5188 wq_size = sizeof(*wq);
5189
5190 wq = kzalloc(wq_size, GFP_KERNEL);
b196be89 5191 if (!wq)
d2c1d404 5192 return NULL;
b196be89 5193
6029a918 5194 if (flags & WQ_UNBOUND) {
be69d00d 5195 wq->unbound_attrs = alloc_workqueue_attrs();
6029a918
TH
5196 if (!wq->unbound_attrs)
5197 goto err_free_wq;
5198 }
5199
669de8bd 5200 va_start(args, max_active);
91ccc6e7 5201 name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);
b196be89 5202 va_end(args);
1da177e4 5203
91ccc6e7
TH
5204 if (name_len >= WQ_NAME_LEN)
5205 pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
5206 wq->name);
31c89007 5207
d320c038 5208 max_active = max_active ?: WQ_DFL_ACTIVE;
b196be89 5209 max_active = wq_clamp_max_active(max_active, flags, wq->name);
3af24433 5210
b196be89 5211 /* init wq */
97e37d7b 5212 wq->flags = flags;
a045a272 5213 wq->max_active = max_active;
5797b1c1
TH
5214 wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
5215 wq->saved_max_active = wq->max_active;
5216 wq->saved_min_active = wq->min_active;
3c25a55d 5217 mutex_init(&wq->mutex);
112202d9 5218 atomic_set(&wq->nr_pwqs_to_flush, 0);
30cdf249 5219 INIT_LIST_HEAD(&wq->pwqs);
73f53c4a
TH
5220 INIT_LIST_HEAD(&wq->flusher_queue);
5221 INIT_LIST_HEAD(&wq->flusher_overflow);
493a1724 5222 INIT_LIST_HEAD(&wq->maydays);
502ca9d8 5223
669de8bd 5224 wq_init_lockdep(wq);
cce1a165 5225 INIT_LIST_HEAD(&wq->list);
3af24433 5226
91ccc6e7
TH
5227 if (flags & WQ_UNBOUND) {
5228 if (alloc_node_nr_active(wq->node_nr_active) < 0)
5229 goto err_unreg_lockdep;
5230 }
5231
30cdf249 5232 if (alloc_and_link_pwqs(wq) < 0)
91ccc6e7 5233 goto err_free_node_nr_active;
1537663f 5234
40c17f75 5235 if (wq_online && init_rescuer(wq) < 0)
983c7515 5236 goto err_destroy;
3af24433 5237
226223ab
TH
5238 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
5239 goto err_destroy;
5240
a0a1a5fd 5241 /*
68e13a67
LJ
5242 * wq_pool_mutex protects global freeze state and workqueues list.
5243 * Grab it, adjust max_active and add the new @wq to workqueues
5244 * list.
a0a1a5fd 5245 */
68e13a67 5246 mutex_lock(&wq_pool_mutex);
a0a1a5fd 5247
a357fc03 5248 mutex_lock(&wq->mutex);
a045a272 5249 wq_adjust_max_active(wq);
a357fc03 5250 mutex_unlock(&wq->mutex);
a0a1a5fd 5251
e2dca7ad 5252 list_add_tail_rcu(&wq->list, &workqueues);
a0a1a5fd 5253
68e13a67 5254 mutex_unlock(&wq_pool_mutex);
1537663f 5255
3af24433 5256 return wq;
d2c1d404 5257
91ccc6e7
TH
5258err_free_node_nr_active:
5259 if (wq->flags & WQ_UNBOUND)
5260 free_node_nr_active(wq->node_nr_active);
82efcab3 5261err_unreg_lockdep:
009bb421
BVA
5262 wq_unregister_lockdep(wq);
5263 wq_free_lockdep(wq);
82efcab3 5264err_free_wq:
6029a918 5265 free_workqueue_attrs(wq->unbound_attrs);
d2c1d404
TH
5266 kfree(wq);
5267 return NULL;
5268err_destroy:
5269 destroy_workqueue(wq);
4690c4ab 5270 return NULL;
3af24433 5271}
669de8bd 5272EXPORT_SYMBOL_GPL(alloc_workqueue);
1da177e4 5273
c29eb853
TH
5274static bool pwq_busy(struct pool_workqueue *pwq)
5275{
5276 int i;
5277
5278 for (i = 0; i < WORK_NR_COLORS; i++)
5279 if (pwq->nr_in_flight[i])
5280 return true;
5281
9f66cff2 5282 if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))
c29eb853 5283 return true;
afa87ce8 5284 if (!pwq_is_empty(pwq))
c29eb853
TH
5285 return true;
5286
5287 return false;
5288}
5289
3af24433
ON
5290/**
5291 * destroy_workqueue - safely terminate a workqueue
5292 * @wq: target workqueue
5293 *
5294 * Safely destroy a workqueue. All work currently pending will be done first.
5295 */
5296void destroy_workqueue(struct workqueue_struct *wq)
5297{
49e3cf44 5298 struct pool_workqueue *pwq;
636b927e 5299 int cpu;
3af24433 5300
def98c84
TH
5301 /*
5302 * Remove it from sysfs first so that sanity check failure doesn't
5303 * lead to sysfs name conflicts.
5304 */
5305 workqueue_sysfs_unregister(wq);
5306
33e3f0a3
RC
5307 /* mark the workqueue destruction is in progress */
5308 mutex_lock(&wq->mutex);
5309 wq->flags |= __WQ_DESTROYING;
5310 mutex_unlock(&wq->mutex);
5311
9c5a2ba7
TH
5312 /* drain it before proceeding with destruction */
5313 drain_workqueue(wq);
c8efcc25 5314
def98c84
TH
5315 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
5316 if (wq->rescuer) {
5317 struct worker *rescuer = wq->rescuer;
5318
5319 /* this prevents new queueing */
a9b8a985 5320 raw_spin_lock_irq(&wq_mayday_lock);
def98c84 5321 wq->rescuer = NULL;
a9b8a985 5322 raw_spin_unlock_irq(&wq_mayday_lock);
def98c84
TH
5323
5324 /* rescuer will empty maydays list before exiting */
5325 kthread_stop(rescuer->task);
8efe1223 5326 kfree(rescuer);
def98c84
TH
5327 }
5328
c29eb853
TH
5329 /*
5330 * Sanity checks - grab all the locks so that we wait for all
5331 * in-flight operations which may do put_pwq().
5332 */
5333 mutex_lock(&wq_pool_mutex);
b09f4fd3 5334 mutex_lock(&wq->mutex);
49e3cf44 5335 for_each_pwq(pwq, wq) {
a9b8a985 5336 raw_spin_lock_irq(&pwq->pool->lock);
c29eb853 5337 if (WARN_ON(pwq_busy(pwq))) {
1d9a6159
KW
5338 pr_warn("%s: %s has the following busy pwq\n",
5339 __func__, wq->name);
c29eb853 5340 show_pwq(pwq);
a9b8a985 5341 raw_spin_unlock_irq(&pwq->pool->lock);
b09f4fd3 5342 mutex_unlock(&wq->mutex);
c29eb853 5343 mutex_unlock(&wq_pool_mutex);
55df0933 5344 show_one_workqueue(wq);
6183c009 5345 return;
76af4d93 5346 }
a9b8a985 5347 raw_spin_unlock_irq(&pwq->pool->lock);
6183c009 5348 }
b09f4fd3 5349 mutex_unlock(&wq->mutex);
6183c009 5350
a0a1a5fd
TH
5351 /*
5352 * wq list is used to freeze wq, remove from list after
5353 * flushing is complete in case freeze races us.
5354 */
e2dca7ad 5355 list_del_rcu(&wq->list);
68e13a67 5356 mutex_unlock(&wq_pool_mutex);
3af24433 5357
636b927e
TH
5358 /*
5359 * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
5360 * to put the base refs. @wq will be auto-destroyed from the last
5361 * pwq_put. RCU read lock prevents @wq from going away from under us.
5362 */
5363 rcu_read_lock();
4c16bd32 5364
636b927e 5365 for_each_possible_cpu(cpu) {
9f66cff2
TH
5366 put_pwq_unlocked(unbound_pwq(wq, cpu));
5367 RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
29c91e99 5368 }
636b927e 5369
9f66cff2
TH
5370 put_pwq_unlocked(unbound_pwq(wq, -1));
5371 RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);
636b927e
TH
5372
5373 rcu_read_unlock();
3af24433
ON
5374}
5375EXPORT_SYMBOL_GPL(destroy_workqueue);
5376
dcd989cb
TH
5377/**
5378 * workqueue_set_max_active - adjust max_active of a workqueue
5379 * @wq: target workqueue
5380 * @max_active: new max_active value.
5381 *
5797b1c1
TH
5382 * Set max_active of @wq to @max_active. See the alloc_workqueue() function
5383 * comment.
dcd989cb
TH
5384 *
5385 * CONTEXT:
5386 * Don't call from IRQ context.
5387 */
5388void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
5389{
8719dcea 5390 /* disallow meddling with max_active for ordered workqueues */
0a94efb5 5391 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
8719dcea
TH
5392 return;
5393
f3421797 5394 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
dcd989cb 5395
a357fc03 5396 mutex_lock(&wq->mutex);
dcd989cb 5397
0a94efb5 5398 wq->flags &= ~__WQ_ORDERED;
dcd989cb 5399 wq->saved_max_active = max_active;
5797b1c1
TH
5400 if (wq->flags & WQ_UNBOUND)
5401 wq->saved_min_active = min(wq->saved_min_active, max_active);
5402
a045a272 5403 wq_adjust_max_active(wq);
93981800 5404
a357fc03 5405 mutex_unlock(&wq->mutex);
15316ba8 5406}
dcd989cb 5407EXPORT_SYMBOL_GPL(workqueue_set_max_active);
15316ba8 5408
27d4ee03
LW
5409/**
5410 * current_work - retrieve %current task's work struct
5411 *
5412 * Determine if %current task is a workqueue worker and what it's working on.
5413 * Useful to find out the context that the %current task is running in.
5414 *
5415 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
5416 */
5417struct work_struct *current_work(void)
5418{
5419 struct worker *worker = current_wq_worker();
5420
5421 return worker ? worker->current_work : NULL;
5422}
5423EXPORT_SYMBOL(current_work);
5424
e6267616
TH
5425/**
5426 * current_is_workqueue_rescuer - is %current workqueue rescuer?
5427 *
5428 * Determine whether %current is a workqueue rescuer. Can be used from
5429 * work functions to determine whether it's being run off the rescuer task.
d185af30
YB
5430 *
5431 * Return: %true if %current is a workqueue rescuer. %false otherwise.
e6267616
TH
5432 */
5433bool current_is_workqueue_rescuer(void)
5434{
5435 struct worker *worker = current_wq_worker();
5436
6a092dfd 5437 return worker && worker->rescue_wq;
e6267616
TH
5438}
5439
eef6a7d5 5440/**
dcd989cb
TH
5441 * workqueue_congested - test whether a workqueue is congested
5442 * @cpu: CPU in question
5443 * @wq: target workqueue
eef6a7d5 5444 *
dcd989cb
TH
5445 * Test whether @wq's cpu workqueue for @cpu is congested. There is
5446 * no synchronization around this function and the test result is
5447 * unreliable and only useful as advisory hints or for debugging.
eef6a7d5 5448 *
d3251859 5449 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
636b927e
TH
5450 *
5451 * With the exception of ordered workqueues, all workqueues have per-cpu
5452 * pool_workqueues, each with its own congested state. A workqueue being
5453 * congested on one CPU doesn't mean that the workqueue is contested on any
5454 * other CPUs.
d3251859 5455 *
d185af30 5456 * Return:
dcd989cb 5457 * %true if congested, %false otherwise.
eef6a7d5 5458 */
d84ff051 5459bool workqueue_congested(int cpu, struct workqueue_struct *wq)
1da177e4 5460{
7fb98ea7 5461 struct pool_workqueue *pwq;
76af4d93
TH
5462 bool ret;
5463
24acfb71
TG
5464 rcu_read_lock();
5465 preempt_disable();
7fb98ea7 5466
d3251859
TH
5467 if (cpu == WORK_CPU_UNBOUND)
5468 cpu = smp_processor_id();
5469
636b927e 5470 pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);
f97a4a1a 5471 ret = !list_empty(&pwq->inactive_works);
636b927e 5472
24acfb71
TG
5473 preempt_enable();
5474 rcu_read_unlock();
76af4d93
TH
5475
5476 return ret;
1da177e4 5477}
dcd989cb 5478EXPORT_SYMBOL_GPL(workqueue_congested);
1da177e4 5479
dcd989cb
TH
5480/**
5481 * work_busy - test whether a work is currently pending or running
5482 * @work: the work to be tested
5483 *
5484 * Test whether @work is currently pending or running. There is no
5485 * synchronization around this function and the test result is
5486 * unreliable and only useful as advisory hints or for debugging.
dcd989cb 5487 *
d185af30 5488 * Return:
dcd989cb
TH
5489 * OR'd bitmask of WORK_BUSY_* bits.
5490 */
5491unsigned int work_busy(struct work_struct *work)
1da177e4 5492{
fa1b54e6 5493 struct worker_pool *pool;
dcd989cb
TH
5494 unsigned long flags;
5495 unsigned int ret = 0;
1da177e4 5496
dcd989cb
TH
5497 if (work_pending(work))
5498 ret |= WORK_BUSY_PENDING;
1da177e4 5499
24acfb71 5500 rcu_read_lock();
fa1b54e6 5501 pool = get_work_pool(work);
038366c5 5502 if (pool) {
a9b8a985 5503 raw_spin_lock_irqsave(&pool->lock, flags);
038366c5
LJ
5504 if (find_worker_executing_work(pool, work))
5505 ret |= WORK_BUSY_RUNNING;
a9b8a985 5506 raw_spin_unlock_irqrestore(&pool->lock, flags);
038366c5 5507 }
24acfb71 5508 rcu_read_unlock();
1da177e4 5509
dcd989cb 5510 return ret;
1da177e4 5511}
dcd989cb 5512EXPORT_SYMBOL_GPL(work_busy);
1da177e4 5513
3d1cb205
TH
5514/**
5515 * set_worker_desc - set description for the current work item
5516 * @fmt: printf-style format string
5517 * @...: arguments for the format string
5518 *
5519 * This function can be called by a running work function to describe what
5520 * the work item is about. If the worker task gets dumped, this
5521 * information will be printed out together to help debugging. The
5522 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
5523 */
5524void set_worker_desc(const char *fmt, ...)
5525{
5526 struct worker *worker = current_wq_worker();
5527 va_list args;
5528
5529 if (worker) {
5530 va_start(args, fmt);
5531 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
5532 va_end(args);
3d1cb205
TH
5533 }
5534}
5c750d58 5535EXPORT_SYMBOL_GPL(set_worker_desc);
3d1cb205
TH
5536
5537/**
5538 * print_worker_info - print out worker information and description
5539 * @log_lvl: the log level to use when printing
5540 * @task: target task
5541 *
5542 * If @task is a worker and currently executing a work item, print out the
5543 * name of the workqueue being serviced and worker description set with
5544 * set_worker_desc() by the currently executing work item.
5545 *
5546 * This function can be safely called on any task as long as the
5547 * task_struct itself is accessible. While safe, this function isn't
5548 * synchronized and may print out mixups or garbages of limited length.
5549 */
5550void print_worker_info(const char *log_lvl, struct task_struct *task)
5551{
5552 work_func_t *fn = NULL;
5553 char name[WQ_NAME_LEN] = { };
5554 char desc[WORKER_DESC_LEN] = { };
5555 struct pool_workqueue *pwq = NULL;
5556 struct workqueue_struct *wq = NULL;
3d1cb205
TH
5557 struct worker *worker;
5558
5559 if (!(task->flags & PF_WQ_WORKER))
5560 return;
5561
5562 /*
5563 * This function is called without any synchronization and @task
5564 * could be in any state. Be careful with dereferences.
5565 */
e700591a 5566 worker = kthread_probe_data(task);
3d1cb205
TH
5567
5568 /*
8bf89593
TH
5569 * Carefully copy the associated workqueue's workfn, name and desc.
5570 * Keep the original last '\0' in case the original is garbage.
3d1cb205 5571 */
fe557319
CH
5572 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
5573 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
5574 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
5575 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
5576 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
3d1cb205
TH
5577
5578 if (fn || name[0] || desc[0]) {
d75f773c 5579 printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
8bf89593 5580 if (strcmp(name, desc))
3d1cb205
TH
5581 pr_cont(" (%s)", desc);
5582 pr_cont("\n");
5583 }
5584}
5585
3494fc30
TH
5586static void pr_cont_pool_info(struct worker_pool *pool)
5587{
5588 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
5589 if (pool->node != NUMA_NO_NODE)
5590 pr_cont(" node=%d", pool->node);
5591 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
5592}
5593
c76feb0d
PM
5594struct pr_cont_work_struct {
5595 bool comma;
5596 work_func_t func;
5597 long ctr;
5598};
5599
5600static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
5601{
5602 if (!pcwsp->ctr)
5603 goto out_record;
5604 if (func == pcwsp->func) {
5605 pcwsp->ctr++;
5606 return;
5607 }
5608 if (pcwsp->ctr == 1)
5609 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
5610 else
5611 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
5612 pcwsp->ctr = 0;
5613out_record:
5614 if ((long)func == -1L)
5615 return;
5616 pcwsp->comma = comma;
5617 pcwsp->func = func;
5618 pcwsp->ctr = 1;
5619}
5620
5621static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
3494fc30
TH
5622{
5623 if (work->func == wq_barrier_func) {
5624 struct wq_barrier *barr;
5625
5626 barr = container_of(work, struct wq_barrier, work);
5627
c76feb0d 5628 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
3494fc30
TH
5629 pr_cont("%s BAR(%d)", comma ? "," : "",
5630 task_pid_nr(barr->task));
5631 } else {
c76feb0d
PM
5632 if (!comma)
5633 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
5634 pr_cont_work_flush(comma, work->func, pcwsp);
3494fc30
TH
5635 }
5636}
5637
5638static void show_pwq(struct pool_workqueue *pwq)
5639{
c76feb0d 5640 struct pr_cont_work_struct pcws = { .ctr = 0, };
3494fc30
TH
5641 struct worker_pool *pool = pwq->pool;
5642 struct work_struct *work;
5643 struct worker *worker;
5644 bool has_in_flight = false, has_pending = false;
5645 int bkt;
5646
5647 pr_info(" pwq %d:", pool->id);
5648 pr_cont_pool_info(pool);
5649
a045a272
TH
5650 pr_cont(" active=%d refcnt=%d%s\n",
5651 pwq->nr_active, pwq->refcnt,
3494fc30
TH
5652 !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
5653
5654 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
5655 if (worker->current_pwq == pwq) {
5656 has_in_flight = true;
5657 break;
5658 }
5659 }
5660 if (has_in_flight) {
5661 bool comma = false;
5662
5663 pr_info(" in-flight:");
5664 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
5665 if (worker->current_pwq != pwq)
5666 continue;
5667
d75f773c 5668 pr_cont("%s %d%s:%ps", comma ? "," : "",
3494fc30 5669 task_pid_nr(worker->task),
30ae2fc0 5670 worker->rescue_wq ? "(RESCUER)" : "",
3494fc30
TH
5671 worker->current_func);
5672 list_for_each_entry(work, &worker->scheduled, entry)
c76feb0d
PM
5673 pr_cont_work(false, work, &pcws);
5674 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5675 comma = true;
5676 }
5677 pr_cont("\n");
5678 }
5679
5680 list_for_each_entry(work, &pool->worklist, entry) {
5681 if (get_work_pwq(work) == pwq) {
5682 has_pending = true;
5683 break;
5684 }
5685 }
5686 if (has_pending) {
5687 bool comma = false;
5688
5689 pr_info(" pending:");
5690 list_for_each_entry(work, &pool->worklist, entry) {
5691 if (get_work_pwq(work) != pwq)
5692 continue;
5693
c76feb0d 5694 pr_cont_work(comma, work, &pcws);
3494fc30
TH
5695 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
5696 }
c76feb0d 5697 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5698 pr_cont("\n");
5699 }
5700
f97a4a1a 5701 if (!list_empty(&pwq->inactive_works)) {
3494fc30
TH
5702 bool comma = false;
5703
f97a4a1a
LJ
5704 pr_info(" inactive:");
5705 list_for_each_entry(work, &pwq->inactive_works, entry) {
c76feb0d 5706 pr_cont_work(comma, work, &pcws);
3494fc30
TH
5707 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
5708 }
c76feb0d 5709 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
3494fc30
TH
5710 pr_cont("\n");
5711 }
5712}
5713
5714/**
55df0933
IK
5715 * show_one_workqueue - dump state of specified workqueue
5716 * @wq: workqueue whose state will be printed
3494fc30 5717 */
55df0933 5718void show_one_workqueue(struct workqueue_struct *wq)
3494fc30 5719{
55df0933
IK
5720 struct pool_workqueue *pwq;
5721 bool idle = true;
3494fc30 5722 unsigned long flags;
3494fc30 5723
55df0933 5724 for_each_pwq(pwq, wq) {
afa87ce8 5725 if (!pwq_is_empty(pwq)) {
55df0933
IK
5726 idle = false;
5727 break;
3494fc30 5728 }
55df0933
IK
5729 }
5730 if (idle) /* Nothing to print for idle workqueue */
5731 return;
3494fc30 5732
55df0933 5733 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
3494fc30 5734
55df0933
IK
5735 for_each_pwq(pwq, wq) {
5736 raw_spin_lock_irqsave(&pwq->pool->lock, flags);
afa87ce8 5737 if (!pwq_is_empty(pwq)) {
62635ea8 5738 /*
55df0933
IK
5739 * Defer printing to avoid deadlocks in console
5740 * drivers that queue work while holding locks
5741 * also taken in their write paths.
62635ea8 5742 */
55df0933
IK
5743 printk_deferred_enter();
5744 show_pwq(pwq);
5745 printk_deferred_exit();
3494fc30 5746 }
55df0933 5747 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
62635ea8
SS
5748 /*
5749 * We could be printing a lot from atomic context, e.g.
55df0933 5750 * sysrq-t -> show_all_workqueues(). Avoid triggering
62635ea8
SS
5751 * hard lockup.
5752 */
5753 touch_nmi_watchdog();
3494fc30
TH
5754 }
5755
55df0933
IK
5756}
5757
5758/**
5759 * show_one_worker_pool - dump state of specified worker pool
5760 * @pool: worker pool whose state will be printed
5761 */
5762static void show_one_worker_pool(struct worker_pool *pool)
5763{
5764 struct worker *worker;
5765 bool first = true;
5766 unsigned long flags;
335a42eb 5767 unsigned long hung = 0;
55df0933
IK
5768
5769 raw_spin_lock_irqsave(&pool->lock, flags);
5770 if (pool->nr_workers == pool->nr_idle)
5771 goto next_pool;
335a42eb
PM
5772
5773 /* How long the first pending work is waiting for a worker. */
5774 if (!list_empty(&pool->worklist))
5775 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
5776
55df0933
IK
5777 /*
5778 * Defer printing to avoid deadlocks in console drivers that
5779 * queue work while holding locks also taken in their write
5780 * paths.
5781 */
5782 printk_deferred_enter();
5783 pr_info("pool %d:", pool->id);
5784 pr_cont_pool_info(pool);
335a42eb 5785 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
55df0933
IK
5786 if (pool->manager)
5787 pr_cont(" manager: %d",
5788 task_pid_nr(pool->manager->task));
5789 list_for_each_entry(worker, &pool->idle_list, entry) {
5790 pr_cont(" %s%d", first ? "idle: " : "",
5791 task_pid_nr(worker->task));
5792 first = false;
5793 }
5794 pr_cont("\n");
5795 printk_deferred_exit();
5796next_pool:
5797 raw_spin_unlock_irqrestore(&pool->lock, flags);
5798 /*
5799 * We could be printing a lot from atomic context, e.g.
5800 * sysrq-t -> show_all_workqueues(). Avoid triggering
5801 * hard lockup.
5802 */
5803 touch_nmi_watchdog();
5804
5805}
5806
5807/**
5808 * show_all_workqueues - dump workqueue state
5809 *
704bc669 5810 * Called from a sysrq handler and prints out all busy workqueues and pools.
55df0933
IK
5811 */
5812void show_all_workqueues(void)
5813{
5814 struct workqueue_struct *wq;
5815 struct worker_pool *pool;
5816 int pi;
5817
5818 rcu_read_lock();
5819
5820 pr_info("Showing busy workqueues and worker pools:\n");
5821
5822 list_for_each_entry_rcu(wq, &workqueues, list)
5823 show_one_workqueue(wq);
5824
5825 for_each_pool(pool, pi)
5826 show_one_worker_pool(pool);
5827
24acfb71 5828 rcu_read_unlock();
3494fc30
TH
5829}
5830
704bc669
JL
5831/**
5832 * show_freezable_workqueues - dump freezable workqueue state
5833 *
5834 * Called from try_to_freeze_tasks() and prints out all freezable workqueues
5835 * still busy.
5836 */
5837void show_freezable_workqueues(void)
5838{
5839 struct workqueue_struct *wq;
5840
5841 rcu_read_lock();
5842
5843 pr_info("Showing freezable workqueues that are still busy:\n");
5844
5845 list_for_each_entry_rcu(wq, &workqueues, list) {
5846 if (!(wq->flags & WQ_FREEZABLE))
5847 continue;
5848 show_one_workqueue(wq);
5849 }
5850
5851 rcu_read_unlock();
5852}
5853
6b59808b
TH
5854/* used to show worker information through /proc/PID/{comm,stat,status} */
5855void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
5856{
6b59808b
TH
5857 int off;
5858
5859 /* always show the actual comm */
5860 off = strscpy(buf, task->comm, size);
5861 if (off < 0)
5862 return;
5863
197f6acc 5864 /* stabilize PF_WQ_WORKER and worker pool association */
6b59808b
TH
5865 mutex_lock(&wq_pool_attach_mutex);
5866
197f6acc
TH
5867 if (task->flags & PF_WQ_WORKER) {
5868 struct worker *worker = kthread_data(task);
5869 struct worker_pool *pool = worker->pool;
6b59808b 5870
197f6acc 5871 if (pool) {
a9b8a985 5872 raw_spin_lock_irq(&pool->lock);
197f6acc
TH
5873 /*
5874 * ->desc tracks information (wq name or
5875 * set_worker_desc()) for the latest execution. If
5876 * current, prepend '+', otherwise '-'.
5877 */
5878 if (worker->desc[0] != '\0') {
5879 if (worker->current_work)
5880 scnprintf(buf + off, size - off, "+%s",
5881 worker->desc);
5882 else
5883 scnprintf(buf + off, size - off, "-%s",
5884 worker->desc);
5885 }
a9b8a985 5886 raw_spin_unlock_irq(&pool->lock);
6b59808b 5887 }
6b59808b
TH
5888 }
5889
5890 mutex_unlock(&wq_pool_attach_mutex);
5891}
5892
66448bc2
MM
5893#ifdef CONFIG_SMP
5894
db7bccf4
TH
5895/*
5896 * CPU hotplug.
5897 *
e22bee78 5898 * There are two challenges in supporting CPU hotplug. Firstly, there
112202d9 5899 * are a lot of assumptions on strong associations among work, pwq and
706026c2 5900 * pool which make migrating pending and scheduled works very
e22bee78 5901 * difficult to implement without impacting hot paths. Secondly,
94cf58bb 5902 * worker pools serve mix of short, long and very long running works making
e22bee78
TH
5903 * blocked draining impractical.
5904 *
24647570 5905 * This is solved by allowing the pools to be disassociated from the CPU
628c78e7
TH
5906 * running as an unbound one and allowing it to be reattached later if the
5907 * cpu comes back online.
db7bccf4 5908 */
1da177e4 5909
e8b3f8db 5910static void unbind_workers(int cpu)
3af24433 5911{
4ce62e9e 5912 struct worker_pool *pool;
db7bccf4 5913 struct worker *worker;
3af24433 5914
f02ae73a 5915 for_each_cpu_worker_pool(pool, cpu) {
1258fae7 5916 mutex_lock(&wq_pool_attach_mutex);
a9b8a985 5917 raw_spin_lock_irq(&pool->lock);
3af24433 5918
94cf58bb 5919 /*
92f9c5c4 5920 * We've blocked all attach/detach operations. Make all workers
94cf58bb 5921 * unbound and set DISASSOCIATED. Before this, all workers
11b45b0b 5922 * must be on the cpu. After this, they may become diasporas.
b4ac9384
LJ
5923 * And the preemption disabled section in their sched callbacks
5924 * are guaranteed to see WORKER_UNBOUND since the code here
5925 * is on the same cpu.
94cf58bb 5926 */
da028469 5927 for_each_pool_worker(worker, pool)
c9e7cf27 5928 worker->flags |= WORKER_UNBOUND;
06ba38a9 5929
24647570 5930 pool->flags |= POOL_DISASSOCIATED;
f2d5a0ee 5931
eb283428 5932 /*
989442d7
LJ
5933 * The handling of nr_running in sched callbacks are disabled
5934 * now. Zap nr_running. After this, nr_running stays zero and
5935 * need_more_worker() and keep_working() are always true as
5936 * long as the worklist is not empty. This pool now behaves as
5937 * an unbound (in terms of concurrency management) pool which
eb283428
LJ
5938 * are served by workers tied to the pool.
5939 */
bc35f7ef 5940 pool->nr_running = 0;
eb283428
LJ
5941
5942 /*
5943 * With concurrency management just turned off, a busy
5944 * worker blocking could lead to lengthy stalls. Kick off
5945 * unbound chain execution of currently pending work items.
5946 */
0219a352 5947 kick_pool(pool);
989442d7 5948
a9b8a985 5949 raw_spin_unlock_irq(&pool->lock);
989442d7 5950
793777bc
VS
5951 for_each_pool_worker(worker, pool)
5952 unbind_worker(worker);
989442d7
LJ
5953
5954 mutex_unlock(&wq_pool_attach_mutex);
eb283428 5955 }
3af24433 5956}
3af24433 5957
bd7c089e
TH
5958/**
5959 * rebind_workers - rebind all workers of a pool to the associated CPU
5960 * @pool: pool of interest
5961 *
a9ab775b 5962 * @pool->cpu is coming online. Rebind all workers to the CPU.
bd7c089e
TH
5963 */
5964static void rebind_workers(struct worker_pool *pool)
5965{
a9ab775b 5966 struct worker *worker;
bd7c089e 5967
1258fae7 5968 lockdep_assert_held(&wq_pool_attach_mutex);
bd7c089e 5969
a9ab775b
TH
5970 /*
5971 * Restore CPU affinity of all workers. As all idle workers should
5972 * be on the run-queue of the associated CPU before any local
402dd89d 5973 * wake-ups for concurrency management happen, restore CPU affinity
a9ab775b
TH
5974 * of all workers first and then clear UNBOUND. As we're called
5975 * from CPU_ONLINE, the following shouldn't fail.
5976 */
c63a2e52
VS
5977 for_each_pool_worker(worker, pool) {
5978 kthread_set_per_cpu(worker->task, pool->cpu);
5979 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
9546b29e 5980 pool_allowed_cpus(pool)) < 0);
c63a2e52 5981 }
bd7c089e 5982
a9b8a985 5983 raw_spin_lock_irq(&pool->lock);
f7c17d26 5984
3de5e884 5985 pool->flags &= ~POOL_DISASSOCIATED;
bd7c089e 5986
da028469 5987 for_each_pool_worker(worker, pool) {
a9ab775b 5988 unsigned int worker_flags = worker->flags;
bd7c089e 5989
a9ab775b
TH
5990 /*
5991 * We want to clear UNBOUND but can't directly call
5992 * worker_clr_flags() or adjust nr_running. Atomically
5993 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
5994 * @worker will clear REBOUND using worker_clr_flags() when
5995 * it initiates the next execution cycle thus restoring
5996 * concurrency management. Note that when or whether
5997 * @worker clears REBOUND doesn't affect correctness.
5998 *
c95491ed 5999 * WRITE_ONCE() is necessary because @worker->flags may be
a9ab775b 6000 * tested without holding any lock in
6d25be57 6001 * wq_worker_running(). Without it, NOT_RUNNING test may
a9ab775b
TH
6002 * fail incorrectly leading to premature concurrency
6003 * management operations.
6004 */
6005 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
6006 worker_flags |= WORKER_REBOUND;
6007 worker_flags &= ~WORKER_UNBOUND;
c95491ed 6008 WRITE_ONCE(worker->flags, worker_flags);
bd7c089e 6009 }
a9ab775b 6010
a9b8a985 6011 raw_spin_unlock_irq(&pool->lock);
bd7c089e
TH
6012}
6013
7dbc725e
TH
6014/**
6015 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
6016 * @pool: unbound pool of interest
6017 * @cpu: the CPU which is coming up
6018 *
6019 * An unbound pool may end up with a cpumask which doesn't have any online
6020 * CPUs. When a worker of such pool get scheduled, the scheduler resets
6021 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
6022 * online CPU before, cpus_allowed of all its workers should be restored.
6023 */
6024static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
6025{
6026 static cpumask_t cpumask;
6027 struct worker *worker;
7dbc725e 6028
1258fae7 6029 lockdep_assert_held(&wq_pool_attach_mutex);
7dbc725e
TH
6030
6031 /* is @cpu allowed for @pool? */
6032 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
6033 return;
6034
7dbc725e 6035 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
7dbc725e
TH
6036
6037 /* as we're called from CPU_ONLINE, the following shouldn't fail */
da028469 6038 for_each_pool_worker(worker, pool)
d945b5e9 6039 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
7dbc725e
TH
6040}
6041
7ee681b2
TG
6042int workqueue_prepare_cpu(unsigned int cpu)
6043{
6044 struct worker_pool *pool;
6045
6046 for_each_cpu_worker_pool(pool, cpu) {
6047 if (pool->nr_workers)
6048 continue;
6049 if (!create_worker(pool))
6050 return -ENOMEM;
6051 }
6052 return 0;
6053}
6054
6055int workqueue_online_cpu(unsigned int cpu)
3af24433 6056{
4ce62e9e 6057 struct worker_pool *pool;
4c16bd32 6058 struct workqueue_struct *wq;
7dbc725e 6059 int pi;
3ce63377 6060
7ee681b2 6061 mutex_lock(&wq_pool_mutex);
7dbc725e 6062
7ee681b2 6063 for_each_pool(pool, pi) {
1258fae7 6064 mutex_lock(&wq_pool_attach_mutex);
94cf58bb 6065
7ee681b2
TG
6066 if (pool->cpu == cpu)
6067 rebind_workers(pool);
6068 else if (pool->cpu < 0)
6069 restore_unbound_workers_cpumask(pool, cpu);
94cf58bb 6070
1258fae7 6071 mutex_unlock(&wq_pool_attach_mutex);
7ee681b2 6072 }
6ba94429 6073
fef59c9c 6074 /* update pod affinity of unbound workqueues */
4cbfd3de 6075 list_for_each_entry(wq, &workqueues, list) {
84193c07
TH
6076 struct workqueue_attrs *attrs = wq->unbound_attrs;
6077
6078 if (attrs) {
6079 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6080 int tcpu;
4cbfd3de 6081
84193c07 6082 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
fef59c9c 6083 wq_update_pod(wq, tcpu, cpu, true);
5797b1c1
TH
6084
6085 mutex_lock(&wq->mutex);
6086 wq_update_node_max_active(wq, -1);
6087 mutex_unlock(&wq->mutex);
4cbfd3de
TH
6088 }
6089 }
6ba94429 6090
7ee681b2
TG
6091 mutex_unlock(&wq_pool_mutex);
6092 return 0;
6ba94429
FW
6093}
6094
7ee681b2 6095int workqueue_offline_cpu(unsigned int cpu)
6ba94429 6096{
6ba94429
FW
6097 struct workqueue_struct *wq;
6098
7ee681b2 6099 /* unbinding per-cpu workers should happen on the local CPU */
e8b3f8db
LJ
6100 if (WARN_ON(cpu != smp_processor_id()))
6101 return -1;
6102
6103 unbind_workers(cpu);
7ee681b2 6104
fef59c9c 6105 /* update pod affinity of unbound workqueues */
7ee681b2 6106 mutex_lock(&wq_pool_mutex);
4cbfd3de 6107 list_for_each_entry(wq, &workqueues, list) {
84193c07
TH
6108 struct workqueue_attrs *attrs = wq->unbound_attrs;
6109
6110 if (attrs) {
6111 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
6112 int tcpu;
4cbfd3de 6113
84193c07 6114 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
fef59c9c 6115 wq_update_pod(wq, tcpu, cpu, false);
5797b1c1
TH
6116
6117 mutex_lock(&wq->mutex);
6118 wq_update_node_max_active(wq, cpu);
6119 mutex_unlock(&wq->mutex);
4cbfd3de
TH
6120 }
6121 }
7ee681b2
TG
6122 mutex_unlock(&wq_pool_mutex);
6123
7ee681b2 6124 return 0;
6ba94429
FW
6125}
6126
6ba94429
FW
6127struct work_for_cpu {
6128 struct work_struct work;
6129 long (*fn)(void *);
6130 void *arg;
6131 long ret;
6132};
6133
6134static void work_for_cpu_fn(struct work_struct *work)
6135{
6136 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
6137
6138 wfc->ret = wfc->fn(wfc->arg);
6139}
6140
6141/**
265f3ed0 6142 * work_on_cpu_key - run a function in thread context on a particular cpu
6ba94429
FW
6143 * @cpu: the cpu to run on
6144 * @fn: the function to run
6145 * @arg: the function arg
265f3ed0 6146 * @key: The lock class key for lock debugging purposes
6ba94429
FW
6147 *
6148 * It is up to the caller to ensure that the cpu doesn't go offline.
6149 * The caller must not hold any locks which would prevent @fn from completing.
6150 *
6151 * Return: The value @fn returns.
6152 */
265f3ed0
FW
6153long work_on_cpu_key(int cpu, long (*fn)(void *),
6154 void *arg, struct lock_class_key *key)
6ba94429
FW
6155{
6156 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
6157
265f3ed0 6158 INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
6ba94429
FW
6159 schedule_work_on(cpu, &wfc.work);
6160 flush_work(&wfc.work);
6161 destroy_work_on_stack(&wfc.work);
6162 return wfc.ret;
6163}
265f3ed0 6164EXPORT_SYMBOL_GPL(work_on_cpu_key);
0e8d6a93
TG
6165
6166/**
265f3ed0 6167 * work_on_cpu_safe_key - run a function in thread context on a particular cpu
0e8d6a93
TG
6168 * @cpu: the cpu to run on
6169 * @fn: the function to run
6170 * @arg: the function argument
265f3ed0 6171 * @key: The lock class key for lock debugging purposes
0e8d6a93
TG
6172 *
6173 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
6174 * any locks which would prevent @fn from completing.
6175 *
6176 * Return: The value @fn returns.
6177 */
265f3ed0
FW
6178long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
6179 void *arg, struct lock_class_key *key)
0e8d6a93
TG
6180{
6181 long ret = -ENODEV;
6182
ffd8bea8 6183 cpus_read_lock();
0e8d6a93 6184 if (cpu_online(cpu))
265f3ed0 6185 ret = work_on_cpu_key(cpu, fn, arg, key);
ffd8bea8 6186 cpus_read_unlock();
0e8d6a93
TG
6187 return ret;
6188}
265f3ed0 6189EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
6ba94429
FW
6190#endif /* CONFIG_SMP */
6191
6192#ifdef CONFIG_FREEZER
6193
6194/**
6195 * freeze_workqueues_begin - begin freezing workqueues
6196 *
6197 * Start freezing workqueues. After this function returns, all freezable
f97a4a1a 6198 * workqueues will queue new works to their inactive_works list instead of
6ba94429
FW
6199 * pool->worklist.
6200 *
6201 * CONTEXT:
6202 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6203 */
6204void freeze_workqueues_begin(void)
6205{
6206 struct workqueue_struct *wq;
6ba94429
FW
6207
6208 mutex_lock(&wq_pool_mutex);
6209
6210 WARN_ON_ONCE(workqueue_freezing);
6211 workqueue_freezing = true;
6212
6213 list_for_each_entry(wq, &workqueues, list) {
6214 mutex_lock(&wq->mutex);
a045a272 6215 wq_adjust_max_active(wq);
6ba94429
FW
6216 mutex_unlock(&wq->mutex);
6217 }
6218
6219 mutex_unlock(&wq_pool_mutex);
6220}
6221
6222/**
6223 * freeze_workqueues_busy - are freezable workqueues still busy?
6224 *
6225 * Check whether freezing is complete. This function must be called
6226 * between freeze_workqueues_begin() and thaw_workqueues().
6227 *
6228 * CONTEXT:
6229 * Grabs and releases wq_pool_mutex.
6230 *
6231 * Return:
6232 * %true if some freezable workqueues are still busy. %false if freezing
6233 * is complete.
6234 */
6235bool freeze_workqueues_busy(void)
6236{
6237 bool busy = false;
6238 struct workqueue_struct *wq;
6239 struct pool_workqueue *pwq;
6240
6241 mutex_lock(&wq_pool_mutex);
6242
6243 WARN_ON_ONCE(!workqueue_freezing);
6244
6245 list_for_each_entry(wq, &workqueues, list) {
6246 if (!(wq->flags & WQ_FREEZABLE))
6247 continue;
6248 /*
6249 * nr_active is monotonically decreasing. It's safe
6250 * to peek without lock.
6251 */
24acfb71 6252 rcu_read_lock();
6ba94429
FW
6253 for_each_pwq(pwq, wq) {
6254 WARN_ON_ONCE(pwq->nr_active < 0);
6255 if (pwq->nr_active) {
6256 busy = true;
24acfb71 6257 rcu_read_unlock();
6ba94429
FW
6258 goto out_unlock;
6259 }
6260 }
24acfb71 6261 rcu_read_unlock();
6ba94429
FW
6262 }
6263out_unlock:
6264 mutex_unlock(&wq_pool_mutex);
6265 return busy;
6266}
6267
6268/**
6269 * thaw_workqueues - thaw workqueues
6270 *
6271 * Thaw workqueues. Normal queueing is restored and all collected
6272 * frozen works are transferred to their respective pool worklists.
6273 *
6274 * CONTEXT:
6275 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
6276 */
6277void thaw_workqueues(void)
6278{
6279 struct workqueue_struct *wq;
6ba94429
FW
6280
6281 mutex_lock(&wq_pool_mutex);
6282
6283 if (!workqueue_freezing)
6284 goto out_unlock;
6285
6286 workqueue_freezing = false;
6287
6288 /* restore max_active and repopulate worklist */
6289 list_for_each_entry(wq, &workqueues, list) {
6290 mutex_lock(&wq->mutex);
a045a272 6291 wq_adjust_max_active(wq);
6ba94429
FW
6292 mutex_unlock(&wq->mutex);
6293 }
6294
6295out_unlock:
6296 mutex_unlock(&wq_pool_mutex);
6297}
6298#endif /* CONFIG_FREEZER */
6299
99c621ef 6300static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
042f7df1
LJ
6301{
6302 LIST_HEAD(ctxs);
6303 int ret = 0;
6304 struct workqueue_struct *wq;
6305 struct apply_wqattrs_ctx *ctx, *n;
6306
6307 lockdep_assert_held(&wq_pool_mutex);
6308
6309 list_for_each_entry(wq, &workqueues, list) {
6310 if (!(wq->flags & WQ_UNBOUND))
6311 continue;
ca10d851 6312
042f7df1 6313 /* creating multiple pwqs breaks ordering guarantee */
ca10d851
WL
6314 if (!list_empty(&wq->pwqs)) {
6315 if (wq->flags & __WQ_ORDERED_EXPLICIT)
6316 continue;
6317 wq->flags &= ~__WQ_ORDERED;
6318 }
042f7df1 6319
99c621ef 6320 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
84193c07
TH
6321 if (IS_ERR(ctx)) {
6322 ret = PTR_ERR(ctx);
042f7df1
LJ
6323 break;
6324 }
6325
6326 list_add_tail(&ctx->list, &ctxs);
6327 }
6328
6329 list_for_each_entry_safe(ctx, n, &ctxs, list) {
6330 if (!ret)
6331 apply_wqattrs_commit(ctx);
6332 apply_wqattrs_cleanup(ctx);
6333 }
6334
99c621ef
LJ
6335 if (!ret) {
6336 mutex_lock(&wq_pool_attach_mutex);
6337 cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
6338 mutex_unlock(&wq_pool_attach_mutex);
6339 }
042f7df1
LJ
6340 return ret;
6341}
6342
fe28f631
WL
6343/**
6344 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
6345 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
6346 *
6347 * This function can be called from cpuset code to provide a set of isolated
6348 * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
6349 * either cpus_read_lock or cpus_write_lock.
6350 */
6351int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
6352{
6353 cpumask_var_t cpumask;
6354 int ret = 0;
6355
6356 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
6357 return -ENOMEM;
6358
6359 lockdep_assert_cpus_held();
6360 mutex_lock(&wq_pool_mutex);
6361
6362 /* Save the current isolated cpumask & export it via sysfs */
6363 cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
6364
6365 /*
6366 * If the operation fails, it will fall back to
6367 * wq_requested_unbound_cpumask which is initially set to
6368 * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten
6369 * by any subsequent write to workqueue/cpumask sysfs file.
6370 */
6371 if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))
6372 cpumask_copy(cpumask, wq_requested_unbound_cpumask);
6373 if (!cpumask_equal(cpumask, wq_unbound_cpumask))
6374 ret = workqueue_apply_unbound_cpumask(cpumask);
6375
6376 mutex_unlock(&wq_pool_mutex);
6377 free_cpumask_var(cpumask);
6378 return ret;
6379}
6380
63c5484e
TH
6381static int parse_affn_scope(const char *val)
6382{
6383 int i;
6384
6385 for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {
6386 if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))
6387 return i;
6388 }
6389 return -EINVAL;
6390}
6391
6392static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
6393{
523a301e
TH
6394 struct workqueue_struct *wq;
6395 int affn, cpu;
63c5484e
TH
6396
6397 affn = parse_affn_scope(val);
6398 if (affn < 0)
6399 return affn;
523a301e
TH
6400 if (affn == WQ_AFFN_DFL)
6401 return -EINVAL;
6402
6403 cpus_read_lock();
6404 mutex_lock(&wq_pool_mutex);
63c5484e
TH
6405
6406 wq_affn_dfl = affn;
523a301e
TH
6407
6408 list_for_each_entry(wq, &workqueues, list) {
6409 for_each_online_cpu(cpu) {
6410 wq_update_pod(wq, cpu, cpu, true);
6411 }
6412 }
6413
6414 mutex_unlock(&wq_pool_mutex);
6415 cpus_read_unlock();
6416
63c5484e
TH
6417 return 0;
6418}
6419
6420static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
6421{
6422 return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);
6423}
6424
6425static const struct kernel_param_ops wq_affn_dfl_ops = {
6426 .set = wq_affn_dfl_set,
6427 .get = wq_affn_dfl_get,
6428};
6429
6430module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);
6431
6ba94429
FW
6432#ifdef CONFIG_SYSFS
6433/*
6434 * Workqueues with WQ_SYSFS flag set is visible to userland via
6435 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
6436 * following attributes.
6437 *
63c5484e
TH
6438 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
6439 * max_active RW int : maximum number of in-flight work items
6ba94429
FW
6440 *
6441 * Unbound workqueues have the following extra attributes.
6442 *
63c5484e
TH
6443 * nice RW int : nice value of the workers
6444 * cpumask RW mask : bitmask of allowed CPUs for the workers
6445 * affinity_scope RW str : worker CPU affinity scope (cache, numa, none)
8639eceb 6446 * affinity_strict RW bool : worker CPU affinity is strict
6ba94429
FW
6447 */
6448struct wq_device {
6449 struct workqueue_struct *wq;
6450 struct device dev;
6451};
6452
6453static struct workqueue_struct *dev_to_wq(struct device *dev)
6454{
6455 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
6456
6457 return wq_dev->wq;
6458}
6459
6460static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
6461 char *buf)
6462{
6463 struct workqueue_struct *wq = dev_to_wq(dev);
6464
6465 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
6466}
6467static DEVICE_ATTR_RO(per_cpu);
6468
6469static ssize_t max_active_show(struct device *dev,
6470 struct device_attribute *attr, char *buf)
6471{
6472 struct workqueue_struct *wq = dev_to_wq(dev);
6473
6474 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
6475}
6476
6477static ssize_t max_active_store(struct device *dev,
6478 struct device_attribute *attr, const char *buf,
6479 size_t count)
6480{
6481 struct workqueue_struct *wq = dev_to_wq(dev);
6482 int val;
6483
6484 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
6485 return -EINVAL;
6486
6487 workqueue_set_max_active(wq, val);
6488 return count;
6489}
6490static DEVICE_ATTR_RW(max_active);
6491
6492static struct attribute *wq_sysfs_attrs[] = {
6493 &dev_attr_per_cpu.attr,
6494 &dev_attr_max_active.attr,
6495 NULL,
6496};
6497ATTRIBUTE_GROUPS(wq_sysfs);
6498
49277a5b
WL
6499static void apply_wqattrs_lock(void)
6500{
6501 /* CPUs should stay stable across pwq creations and installations */
6502 cpus_read_lock();
6503 mutex_lock(&wq_pool_mutex);
6504}
6505
6506static void apply_wqattrs_unlock(void)
6507{
6508 mutex_unlock(&wq_pool_mutex);
6509 cpus_read_unlock();
6510}
6511
6ba94429
FW
6512static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
6513 char *buf)
6514{
6515 struct workqueue_struct *wq = dev_to_wq(dev);
6516 int written;
6517
6518 mutex_lock(&wq->mutex);
6519 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
6520 mutex_unlock(&wq->mutex);
6521
6522 return written;
6523}
6524
6525/* prepare workqueue_attrs for sysfs store operations */
6526static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
6527{
6528 struct workqueue_attrs *attrs;
6529
899a94fe
LJ
6530 lockdep_assert_held(&wq_pool_mutex);
6531
be69d00d 6532 attrs = alloc_workqueue_attrs();
6ba94429
FW
6533 if (!attrs)
6534 return NULL;
6535
6ba94429 6536 copy_workqueue_attrs(attrs, wq->unbound_attrs);
6ba94429
FW
6537 return attrs;
6538}
6539
6540static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
6541 const char *buf, size_t count)
6542{
6543 struct workqueue_struct *wq = dev_to_wq(dev);
6544 struct workqueue_attrs *attrs;
d4d3e257
LJ
6545 int ret = -ENOMEM;
6546
6547 apply_wqattrs_lock();
6ba94429
FW
6548
6549 attrs = wq_sysfs_prep_attrs(wq);
6550 if (!attrs)
d4d3e257 6551 goto out_unlock;
6ba94429
FW
6552
6553 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
6554 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
d4d3e257 6555 ret = apply_workqueue_attrs_locked(wq, attrs);
6ba94429
FW
6556 else
6557 ret = -EINVAL;
6558
d4d3e257
LJ
6559out_unlock:
6560 apply_wqattrs_unlock();
6ba94429
FW
6561 free_workqueue_attrs(attrs);
6562 return ret ?: count;
6563}
6564
6565static ssize_t wq_cpumask_show(struct device *dev,
6566 struct device_attribute *attr, char *buf)
6567{
6568 struct workqueue_struct *wq = dev_to_wq(dev);
6569 int written;
6570
6571 mutex_lock(&wq->mutex);
6572 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
6573 cpumask_pr_args(wq->unbound_attrs->cpumask));
6574 mutex_unlock(&wq->mutex);
6575 return written;
6576}
6577
6578static ssize_t wq_cpumask_store(struct device *dev,
6579 struct device_attribute *attr,
6580 const char *buf, size_t count)
6581{
6582 struct workqueue_struct *wq = dev_to_wq(dev);
6583 struct workqueue_attrs *attrs;
d4d3e257
LJ
6584 int ret = -ENOMEM;
6585
6586 apply_wqattrs_lock();
6ba94429
FW
6587
6588 attrs = wq_sysfs_prep_attrs(wq);
6589 if (!attrs)
d4d3e257 6590 goto out_unlock;
6ba94429
FW
6591
6592 ret = cpumask_parse(buf, attrs->cpumask);
6593 if (!ret)
d4d3e257 6594 ret = apply_workqueue_attrs_locked(wq, attrs);
6ba94429 6595
d4d3e257
LJ
6596out_unlock:
6597 apply_wqattrs_unlock();
6ba94429
FW
6598 free_workqueue_attrs(attrs);
6599 return ret ?: count;
6600}
6601
63c5484e
TH
6602static ssize_t wq_affn_scope_show(struct device *dev,
6603 struct device_attribute *attr, char *buf)
6604{
6605 struct workqueue_struct *wq = dev_to_wq(dev);
6606 int written;
6607
6608 mutex_lock(&wq->mutex);
523a301e
TH
6609 if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
6610 written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
6611 wq_affn_names[WQ_AFFN_DFL],
6612 wq_affn_names[wq_affn_dfl]);
6613 else
6614 written = scnprintf(buf, PAGE_SIZE, "%s\n",
6615 wq_affn_names[wq->unbound_attrs->affn_scope]);
63c5484e
TH
6616 mutex_unlock(&wq->mutex);
6617
6618 return written;
6619}
6620
6621static ssize_t wq_affn_scope_store(struct device *dev,
6622 struct device_attribute *attr,
6623 const char *buf, size_t count)
6624{
6625 struct workqueue_struct *wq = dev_to_wq(dev);
6626 struct workqueue_attrs *attrs;
6627 int affn, ret = -ENOMEM;
6628
6629 affn = parse_affn_scope(buf);
6630 if (affn < 0)
6631 return affn;
6632
6633 apply_wqattrs_lock();
6634 attrs = wq_sysfs_prep_attrs(wq);
6635 if (attrs) {
6636 attrs->affn_scope = affn;
6637 ret = apply_workqueue_attrs_locked(wq, attrs);
6638 }
6639 apply_wqattrs_unlock();
6640 free_workqueue_attrs(attrs);
6641 return ret ?: count;
6642}
6643
8639eceb
TH
6644static ssize_t wq_affinity_strict_show(struct device *dev,
6645 struct device_attribute *attr, char *buf)
6646{
6647 struct workqueue_struct *wq = dev_to_wq(dev);
6648
6649 return scnprintf(buf, PAGE_SIZE, "%d\n",
6650 wq->unbound_attrs->affn_strict);
6651}
6652
6653static ssize_t wq_affinity_strict_store(struct device *dev,
6654 struct device_attribute *attr,
6655 const char *buf, size_t count)
6656{
6657 struct workqueue_struct *wq = dev_to_wq(dev);
6658 struct workqueue_attrs *attrs;
6659 int v, ret = -ENOMEM;
6660
6661 if (sscanf(buf, "%d", &v) != 1)
6662 return -EINVAL;
6663
6664 apply_wqattrs_lock();
6665 attrs = wq_sysfs_prep_attrs(wq);
6666 if (attrs) {
6667 attrs->affn_strict = (bool)v;
6668 ret = apply_workqueue_attrs_locked(wq, attrs);
6669 }
6670 apply_wqattrs_unlock();
6671 free_workqueue_attrs(attrs);
6672 return ret ?: count;
6673}
6674
6ba94429 6675static struct device_attribute wq_sysfs_unbound_attrs[] = {
6ba94429
FW
6676 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
6677 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
63c5484e 6678 __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
8639eceb 6679 __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
6ba94429
FW
6680 __ATTR_NULL,
6681};
8ccad40d 6682
6ba94429
FW
6683static struct bus_type wq_subsys = {
6684 .name = "workqueue",
6685 .dev_groups = wq_sysfs_groups,
2d3854a3
RR
6686};
6687
49277a5b
WL
6688/**
6689 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
6690 * @cpumask: the cpumask to set
6691 *
6692 * The low-level workqueues cpumask is a global cpumask that limits
6693 * the affinity of all unbound workqueues. This function check the @cpumask
6694 * and apply it to all unbound workqueues and updates all pwqs of them.
6695 *
6696 * Return: 0 - Success
6697 * -EINVAL - Invalid @cpumask
6698 * -ENOMEM - Failed to allocate memory for attrs or pwqs.
6699 */
6700static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
6701{
6702 int ret = -EINVAL;
6703
6704 /*
6705 * Not excluding isolated cpus on purpose.
6706 * If the user wishes to include them, we allow that.
6707 */
6708 cpumask_and(cpumask, cpumask, cpu_possible_mask);
6709 if (!cpumask_empty(cpumask)) {
6710 apply_wqattrs_lock();
6711 cpumask_copy(wq_requested_unbound_cpumask, cpumask);
6712 if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
6713 ret = 0;
6714 goto out_unlock;
6715 }
6716
6717 ret = workqueue_apply_unbound_cpumask(cpumask);
6718
6719out_unlock:
6720 apply_wqattrs_unlock();
6721 }
6722
6723 return ret;
6724}
6725
fe28f631
WL
6726static ssize_t __wq_cpumask_show(struct device *dev,
6727 struct device_attribute *attr, char *buf, cpumask_var_t mask)
b05a7928
FW
6728{
6729 int written;
6730
042f7df1 6731 mutex_lock(&wq_pool_mutex);
fe28f631 6732 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
042f7df1 6733 mutex_unlock(&wq_pool_mutex);
b05a7928
FW
6734
6735 return written;
6736}
6737
fe28f631
WL
6738static ssize_t wq_unbound_cpumask_show(struct device *dev,
6739 struct device_attribute *attr, char *buf)
6740{
6741 return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
6742}
6743
6744static ssize_t wq_requested_cpumask_show(struct device *dev,
6745 struct device_attribute *attr, char *buf)
6746{
6747 return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
6748}
6749
6750static ssize_t wq_isolated_cpumask_show(struct device *dev,
6751 struct device_attribute *attr, char *buf)
6752{
6753 return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
6754}
6755
042f7df1
LJ
6756static ssize_t wq_unbound_cpumask_store(struct device *dev,
6757 struct device_attribute *attr, const char *buf, size_t count)
6758{
6759 cpumask_var_t cpumask;
6760 int ret;
6761
6762 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
6763 return -ENOMEM;
6764
6765 ret = cpumask_parse(buf, cpumask);
6766 if (!ret)
6767 ret = workqueue_set_unbound_cpumask(cpumask);
6768
6769 free_cpumask_var(cpumask);
6770 return ret ? ret : count;
6771}
6772
fe28f631 6773static struct device_attribute wq_sysfs_cpumask_attrs[] = {
042f7df1 6774 __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
fe28f631
WL
6775 wq_unbound_cpumask_store),
6776 __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),
6777 __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),
6778 __ATTR_NULL,
6779};
b05a7928 6780
6ba94429 6781static int __init wq_sysfs_init(void)
2d3854a3 6782{
686f6697 6783 struct device *dev_root;
b05a7928
FW
6784 int err;
6785
6786 err = subsys_virtual_register(&wq_subsys, NULL);
6787 if (err)
6788 return err;
6789
686f6697
GKH
6790 dev_root = bus_get_dev_root(&wq_subsys);
6791 if (dev_root) {
fe28f631
WL
6792 struct device_attribute *attr;
6793
6794 for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {
6795 err = device_create_file(dev_root, attr);
6796 if (err)
6797 break;
6798 }
686f6697
GKH
6799 put_device(dev_root);
6800 }
6801 return err;
2d3854a3 6802}
6ba94429 6803core_initcall(wq_sysfs_init);
2d3854a3 6804
6ba94429 6805static void wq_device_release(struct device *dev)
2d3854a3 6806{
6ba94429 6807 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
6b44003e 6808
6ba94429 6809 kfree(wq_dev);
2d3854a3 6810}
a0a1a5fd
TH
6811
6812/**
6ba94429
FW
6813 * workqueue_sysfs_register - make a workqueue visible in sysfs
6814 * @wq: the workqueue to register
a0a1a5fd 6815 *
6ba94429
FW
6816 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
6817 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
6818 * which is the preferred method.
a0a1a5fd 6819 *
6ba94429
FW
6820 * Workqueue user should use this function directly iff it wants to apply
6821 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
6822 * apply_workqueue_attrs() may race against userland updating the
6823 * attributes.
6824 *
6825 * Return: 0 on success, -errno on failure.
a0a1a5fd 6826 */
6ba94429 6827int workqueue_sysfs_register(struct workqueue_struct *wq)
a0a1a5fd 6828{
6ba94429
FW
6829 struct wq_device *wq_dev;
6830 int ret;
a0a1a5fd 6831
6ba94429 6832 /*
402dd89d 6833 * Adjusting max_active or creating new pwqs by applying
6ba94429
FW
6834 * attributes breaks ordering guarantee. Disallow exposing ordered
6835 * workqueues.
6836 */
0a94efb5 6837 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
6ba94429 6838 return -EINVAL;
a0a1a5fd 6839
6ba94429
FW
6840 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
6841 if (!wq_dev)
6842 return -ENOMEM;
5bcab335 6843
6ba94429
FW
6844 wq_dev->wq = wq;
6845 wq_dev->dev.bus = &wq_subsys;
6ba94429 6846 wq_dev->dev.release = wq_device_release;
23217b44 6847 dev_set_name(&wq_dev->dev, "%s", wq->name);
a0a1a5fd 6848
6ba94429
FW
6849 /*
6850 * unbound_attrs are created separately. Suppress uevent until
6851 * everything is ready.
6852 */
6853 dev_set_uevent_suppress(&wq_dev->dev, true);
a0a1a5fd 6854
6ba94429
FW
6855 ret = device_register(&wq_dev->dev);
6856 if (ret) {
537f4146 6857 put_device(&wq_dev->dev);
6ba94429
FW
6858 wq->wq_dev = NULL;
6859 return ret;
6860 }
a0a1a5fd 6861
6ba94429
FW
6862 if (wq->flags & WQ_UNBOUND) {
6863 struct device_attribute *attr;
a0a1a5fd 6864
6ba94429
FW
6865 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
6866 ret = device_create_file(&wq_dev->dev, attr);
6867 if (ret) {
6868 device_unregister(&wq_dev->dev);
6869 wq->wq_dev = NULL;
6870 return ret;
a0a1a5fd
TH
6871 }
6872 }
6873 }
6ba94429
FW
6874
6875 dev_set_uevent_suppress(&wq_dev->dev, false);
6876 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
6877 return 0;
a0a1a5fd
TH
6878}
6879
6880/**
6ba94429
FW
6881 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
6882 * @wq: the workqueue to unregister
a0a1a5fd 6883 *
6ba94429 6884 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
a0a1a5fd 6885 */
6ba94429 6886static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
a0a1a5fd 6887{
6ba94429 6888 struct wq_device *wq_dev = wq->wq_dev;
8b03ae3c 6889
6ba94429
FW
6890 if (!wq->wq_dev)
6891 return;
a0a1a5fd 6892
6ba94429
FW
6893 wq->wq_dev = NULL;
6894 device_unregister(&wq_dev->dev);
a0a1a5fd 6895}
6ba94429
FW
6896#else /* CONFIG_SYSFS */
6897static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
6898#endif /* CONFIG_SYSFS */
a0a1a5fd 6899
82607adc
TH
6900/*
6901 * Workqueue watchdog.
6902 *
6903 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal
6904 * flush dependency, a concurrency managed work item which stays RUNNING
6905 * indefinitely. Workqueue stalls can be very difficult to debug as the
6906 * usual warning mechanisms don't trigger and internal workqueue state is
6907 * largely opaque.
6908 *
6909 * Workqueue watchdog monitors all worker pools periodically and dumps
6910 * state if some pools failed to make forward progress for a while where
6911 * forward progress is defined as the first item on ->worklist changing.
6912 *
6913 * This mechanism is controlled through the kernel parameter
6914 * "workqueue.watchdog_thresh" which can be updated at runtime through the
6915 * corresponding sysfs parameter file.
6916 */
6917#ifdef CONFIG_WQ_WATCHDOG
6918
82607adc 6919static unsigned long wq_watchdog_thresh = 30;
5cd79d6a 6920static struct timer_list wq_watchdog_timer;
82607adc
TH
6921
6922static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
6923static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
6924
cd2440d6
PM
6925/*
6926 * Show workers that might prevent the processing of pending work items.
6927 * The only candidates are CPU-bound workers in the running state.
6928 * Pending work items should be handled by another idle worker
6929 * in all other situations.
6930 */
6931static void show_cpu_pool_hog(struct worker_pool *pool)
6932{
6933 struct worker *worker;
6934 unsigned long flags;
6935 int bkt;
6936
6937 raw_spin_lock_irqsave(&pool->lock, flags);
6938
6939 hash_for_each(pool->busy_hash, bkt, worker, hentry) {
6940 if (task_is_running(worker->task)) {
6941 /*
6942 * Defer printing to avoid deadlocks in console
6943 * drivers that queue work while holding locks
6944 * also taken in their write paths.
6945 */
6946 printk_deferred_enter();
6947
6948 pr_info("pool %d:\n", pool->id);
6949 sched_show_task(worker->task);
6950
6951 printk_deferred_exit();
6952 }
6953 }
6954
6955 raw_spin_unlock_irqrestore(&pool->lock, flags);
6956}
6957
6958static void show_cpu_pools_hogs(void)
6959{
6960 struct worker_pool *pool;
6961 int pi;
6962
6963 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
6964
6965 rcu_read_lock();
6966
6967 for_each_pool(pool, pi) {
6968 if (pool->cpu_stall)
6969 show_cpu_pool_hog(pool);
6970
6971 }
6972
6973 rcu_read_unlock();
6974}
6975
82607adc
TH
6976static void wq_watchdog_reset_touched(void)
6977{
6978 int cpu;
6979
6980 wq_watchdog_touched = jiffies;
6981 for_each_possible_cpu(cpu)
6982 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
6983}
6984
5cd79d6a 6985static void wq_watchdog_timer_fn(struct timer_list *unused)
82607adc
TH
6986{
6987 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
6988 bool lockup_detected = false;
cd2440d6 6989 bool cpu_pool_stall = false;
940d71c6 6990 unsigned long now = jiffies;
82607adc
TH
6991 struct worker_pool *pool;
6992 int pi;
6993
6994 if (!thresh)
6995 return;
6996
6997 rcu_read_lock();
6998
6999 for_each_pool(pool, pi) {
7000 unsigned long pool_ts, touched, ts;
7001
cd2440d6 7002 pool->cpu_stall = false;
82607adc
TH
7003 if (list_empty(&pool->worklist))
7004 continue;
7005
940d71c6
SS
7006 /*
7007 * If a virtual machine is stopped by the host it can look to
7008 * the watchdog like a stall.
7009 */
7010 kvm_check_and_clear_guest_paused();
7011
82607adc 7012 /* get the latest of pool and touched timestamps */
89e28ce6
WQ
7013 if (pool->cpu >= 0)
7014 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
7015 else
7016 touched = READ_ONCE(wq_watchdog_touched);
82607adc 7017 pool_ts = READ_ONCE(pool->watchdog_ts);
82607adc
TH
7018
7019 if (time_after(pool_ts, touched))
7020 ts = pool_ts;
7021 else
7022 ts = touched;
7023
82607adc 7024 /* did we stall? */
940d71c6 7025 if (time_after(now, ts + thresh)) {
82607adc 7026 lockup_detected = true;
cd2440d6
PM
7027 if (pool->cpu >= 0) {
7028 pool->cpu_stall = true;
7029 cpu_pool_stall = true;
7030 }
82607adc
TH
7031 pr_emerg("BUG: workqueue lockup - pool");
7032 pr_cont_pool_info(pool);
7033 pr_cont(" stuck for %us!\n",
940d71c6 7034 jiffies_to_msecs(now - pool_ts) / 1000);
82607adc 7035 }
cd2440d6
PM
7036
7037
82607adc
TH
7038 }
7039
7040 rcu_read_unlock();
7041
7042 if (lockup_detected)
55df0933 7043 show_all_workqueues();
82607adc 7044
cd2440d6
PM
7045 if (cpu_pool_stall)
7046 show_cpu_pools_hogs();
7047
82607adc
TH
7048 wq_watchdog_reset_touched();
7049 mod_timer(&wq_watchdog_timer, jiffies + thresh);
7050}
7051
cb9d7fd5 7052notrace void wq_watchdog_touch(int cpu)
82607adc
TH
7053{
7054 if (cpu >= 0)
7055 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
89e28ce6
WQ
7056
7057 wq_watchdog_touched = jiffies;
82607adc
TH
7058}
7059
7060static void wq_watchdog_set_thresh(unsigned long thresh)
7061{
7062 wq_watchdog_thresh = 0;
7063 del_timer_sync(&wq_watchdog_timer);
7064
7065 if (thresh) {
7066 wq_watchdog_thresh = thresh;
7067 wq_watchdog_reset_touched();
7068 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
7069 }
7070}
7071
7072static int wq_watchdog_param_set_thresh(const char *val,
7073 const struct kernel_param *kp)
7074{
7075 unsigned long thresh;
7076 int ret;
7077
7078 ret = kstrtoul(val, 0, &thresh);
7079 if (ret)
7080 return ret;
7081
7082 if (system_wq)
7083 wq_watchdog_set_thresh(thresh);
7084 else
7085 wq_watchdog_thresh = thresh;
7086
7087 return 0;
7088}
7089
7090static const struct kernel_param_ops wq_watchdog_thresh_ops = {
7091 .set = wq_watchdog_param_set_thresh,
7092 .get = param_get_ulong,
7093};
7094
7095module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
7096 0644);
7097
7098static void wq_watchdog_init(void)
7099{
5cd79d6a 7100 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
82607adc
TH
7101 wq_watchdog_set_thresh(wq_watchdog_thresh);
7102}
7103
7104#else /* CONFIG_WQ_WATCHDOG */
7105
7106static inline void wq_watchdog_init(void) { }
7107
7108#endif /* CONFIG_WQ_WATCHDOG */
7109
4a6c5607
TH
7110static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
7111{
7112 if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
7113 pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
7114 cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
7115 return;
7116 }
7117
7118 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
7119}
7120
3347fa09
TH
7121/**
7122 * workqueue_init_early - early init for workqueue subsystem
7123 *
2930155b
TH
7124 * This is the first step of three-staged workqueue subsystem initialization and
7125 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
7126 * up. It sets up all the data structures and system workqueues and allows early
7127 * boot code to create workqueues and queue/cancel work items. Actual work item
7128 * execution starts only after kthreads can be created and scheduled right
7129 * before early initcalls.
3347fa09 7130 */
2333e829 7131void __init workqueue_init_early(void)
1da177e4 7132{
84193c07 7133 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
7a4e344c
TH
7134 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
7135 int i, cpu;
c34056a3 7136
10cdb157 7137 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
e904e6c2 7138
b05a7928 7139 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
fe28f631
WL
7140 BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
7141 BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
b05a7928 7142
4a6c5607
TH
7143 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
7144 restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
7145 restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
ace3c549 7146 if (!cpumask_empty(&wq_cmdline_cpumask))
4a6c5607 7147 restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
ace3c549 7148
fe28f631 7149 cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
ace3c549 7150
e904e6c2
TH
7151 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
7152
2930155b
TH
7153 wq_update_pod_attrs_buf = alloc_workqueue_attrs();
7154 BUG_ON(!wq_update_pod_attrs_buf);
7155
7bd20b6b
MT
7156 /*
7157 * If nohz_full is enabled, set power efficient workqueue as unbound.
7158 * This allows workqueue items to be moved to HK CPUs.
7159 */
7160 if (housekeeping_enabled(HK_TYPE_TICK))
7161 wq_power_efficient = true;
7162
84193c07
TH
7163 /* initialize WQ_AFFN_SYSTEM pods */
7164 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
7165 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
7166 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
7167 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);
7168
7169 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));
7170
84193c07
TH
7171 pt->nr_pods = 1;
7172 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
7173 pt->pod_node[0] = NUMA_NO_NODE;
7174 pt->cpu_pod[0] = 0;
7175
706026c2 7176 /* initialize CPU pools */
29c91e99 7177 for_each_possible_cpu(cpu) {
4ce62e9e 7178 struct worker_pool *pool;
8b03ae3c 7179
7a4e344c 7180 i = 0;
f02ae73a 7181 for_each_cpu_worker_pool(pool, cpu) {
7a4e344c 7182 BUG_ON(init_worker_pool(pool));
ec22ca5e 7183 pool->cpu = cpu;
29c91e99 7184 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
9546b29e 7185 cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));
7a4e344c 7186 pool->attrs->nice = std_nice[i++];
8639eceb 7187 pool->attrs->affn_strict = true;
f3f90ad4 7188 pool->node = cpu_to_node(cpu);
7a4e344c 7189
9daf9e67 7190 /* alloc pool ID */
68e13a67 7191 mutex_lock(&wq_pool_mutex);
9daf9e67 7192 BUG_ON(worker_pool_assign_id(pool));
68e13a67 7193 mutex_unlock(&wq_pool_mutex);
4ce62e9e 7194 }
8b03ae3c
TH
7195 }
7196
8a2b7538 7197 /* create default unbound and ordered wq attrs */
29c91e99
TH
7198 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
7199 struct workqueue_attrs *attrs;
7200
be69d00d 7201 BUG_ON(!(attrs = alloc_workqueue_attrs()));
29c91e99 7202 attrs->nice = std_nice[i];
29c91e99 7203 unbound_std_wq_attrs[i] = attrs;
8a2b7538
TH
7204
7205 /*
7206 * An ordered wq should have only one pwq as ordering is
7207 * guaranteed by max_active which is enforced by pwqs.
8a2b7538 7208 */
be69d00d 7209 BUG_ON(!(attrs = alloc_workqueue_attrs()));
8a2b7538 7210 attrs->nice = std_nice[i];
af73f5c9 7211 attrs->ordered = true;
8a2b7538 7212 ordered_wq_attrs[i] = attrs;
29c91e99
TH
7213 }
7214
d320c038 7215 system_wq = alloc_workqueue("events", 0, 0);
1aabe902 7216 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
d320c038 7217 system_long_wq = alloc_workqueue("events_long", 0, 0);
f3421797 7218 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
636b927e 7219 WQ_MAX_ACTIVE);
24d51add
TH
7220 system_freezable_wq = alloc_workqueue("events_freezable",
7221 WQ_FREEZABLE, 0);
0668106c
VK
7222 system_power_efficient_wq = alloc_workqueue("events_power_efficient",
7223 WQ_POWER_EFFICIENT, 0);
8318d6a6 7224 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
0668106c
VK
7225 WQ_FREEZABLE | WQ_POWER_EFFICIENT,
7226 0);
1aabe902 7227 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
0668106c
VK
7228 !system_unbound_wq || !system_freezable_wq ||
7229 !system_power_efficient_wq ||
7230 !system_freezable_power_efficient_wq);
3347fa09
TH
7231}
7232
aa6fde93
TH
7233static void __init wq_cpu_intensive_thresh_init(void)
7234{
7235 unsigned long thresh;
7236 unsigned long bogo;
7237
dd64c873
Z
7238 pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
7239 BUG_ON(IS_ERR(pwq_release_worker));
7240
aa6fde93
TH
7241 /* if the user set it to a specific value, keep it */
7242 if (wq_cpu_intensive_thresh_us != ULONG_MAX)
7243 return;
7244
7245 /*
7246 * The default of 10ms is derived from the fact that most modern (as of
7247 * 2023) processors can do a lot in 10ms and that it's just below what
7248 * most consider human-perceivable. However, the kernel also runs on a
7249 * lot slower CPUs including microcontrollers where the threshold is way
7250 * too low.
7251 *
7252 * Let's scale up the threshold upto 1 second if BogoMips is below 4000.
7253 * This is by no means accurate but it doesn't have to be. The mechanism
7254 * is still useful even when the threshold is fully scaled up. Also, as
7255 * the reports would usually be applicable to everyone, some machines
7256 * operating on longer thresholds won't significantly diminish their
7257 * usefulness.
7258 */
7259 thresh = 10 * USEC_PER_MSEC;
7260
7261 /* see init/calibrate.c for lpj -> BogoMIPS calculation */
7262 bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
7263 if (bogo < 4000)
7264 thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
7265
7266 pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
7267 loops_per_jiffy, bogo, thresh);
7268
7269 wq_cpu_intensive_thresh_us = thresh;
7270}
7271
3347fa09
TH
7272/**
7273 * workqueue_init - bring workqueue subsystem fully online
7274 *
2930155b
TH
7275 * This is the second step of three-staged workqueue subsystem initialization
7276 * and invoked as soon as kthreads can be created and scheduled. Workqueues have
7277 * been created and work items queued on them, but there are no kworkers
7278 * executing the work items yet. Populate the worker pools with the initial
7279 * workers and enable future kworker creations.
3347fa09 7280 */
2333e829 7281void __init workqueue_init(void)
3347fa09 7282{
2186d9f9 7283 struct workqueue_struct *wq;
3347fa09
TH
7284 struct worker_pool *pool;
7285 int cpu, bkt;
7286
aa6fde93
TH
7287 wq_cpu_intensive_thresh_init();
7288
2186d9f9
TH
7289 mutex_lock(&wq_pool_mutex);
7290
2930155b
TH
7291 /*
7292 * Per-cpu pools created earlier could be missing node hint. Fix them
7293 * up. Also, create a rescuer for workqueues that requested it.
7294 */
2186d9f9
TH
7295 for_each_possible_cpu(cpu) {
7296 for_each_cpu_worker_pool(pool, cpu) {
7297 pool->node = cpu_to_node(cpu);
7298 }
7299 }
7300
40c17f75 7301 list_for_each_entry(wq, &workqueues, list) {
40c17f75
TH
7302 WARN(init_rescuer(wq),
7303 "workqueue: failed to create early rescuer for %s",
7304 wq->name);
7305 }
2186d9f9
TH
7306
7307 mutex_unlock(&wq_pool_mutex);
7308
3347fa09
TH
7309 /* create the initial workers */
7310 for_each_online_cpu(cpu) {
7311 for_each_cpu_worker_pool(pool, cpu) {
7312 pool->flags &= ~POOL_DISASSOCIATED;
7313 BUG_ON(!create_worker(pool));
7314 }
7315 }
7316
7317 hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
7318 BUG_ON(!create_worker(pool));
7319
7320 wq_online = true;
82607adc 7321 wq_watchdog_init();
1da177e4 7322}
c4f135d6 7323
025e1684
TH
7324/*
7325 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to
7326 * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique
7327 * and consecutive pod ID. The rest of @pt is initialized accordingly.
7328 */
7329static void __init init_pod_type(struct wq_pod_type *pt,
7330 bool (*cpus_share_pod)(int, int))
7331{
7332 int cur, pre, cpu, pod;
7333
7334 pt->nr_pods = 0;
7335
7336 /* init @pt->cpu_pod[] according to @cpus_share_pod() */
7337 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
7338 BUG_ON(!pt->cpu_pod);
7339
7340 for_each_possible_cpu(cur) {
7341 for_each_possible_cpu(pre) {
7342 if (pre >= cur) {
7343 pt->cpu_pod[cur] = pt->nr_pods++;
7344 break;
7345 }
7346 if (cpus_share_pod(cur, pre)) {
7347 pt->cpu_pod[cur] = pt->cpu_pod[pre];
7348 break;
7349 }
7350 }
7351 }
7352
7353 /* init the rest to match @pt->cpu_pod[] */
7354 pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
7355 pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
7356 BUG_ON(!pt->pod_cpus || !pt->pod_node);
7357
7358 for (pod = 0; pod < pt->nr_pods; pod++)
7359 BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
7360
7361 for_each_possible_cpu(cpu) {
7362 cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);
7363 pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);
7364 }
7365}
7366
63c5484e
TH
7367static bool __init cpus_dont_share(int cpu0, int cpu1)
7368{
7369 return false;
7370}
7371
7372static bool __init cpus_share_smt(int cpu0, int cpu1)
7373{
7374#ifdef CONFIG_SCHED_SMT
7375 return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
7376#else
7377 return false;
7378#endif
7379}
7380
025e1684
TH
7381static bool __init cpus_share_numa(int cpu0, int cpu1)
7382{
7383 return cpu_to_node(cpu0) == cpu_to_node(cpu1);
7384}
7385
2930155b
TH
7386/**
7387 * workqueue_init_topology - initialize CPU pods for unbound workqueues
7388 *
7389 * This is the third step of there-staged workqueue subsystem initialization and
7390 * invoked after SMP and topology information are fully initialized. It
7391 * initializes the unbound CPU pods accordingly.
7392 */
7393void __init workqueue_init_topology(void)
a86feae6 7394{
2930155b 7395 struct workqueue_struct *wq;
025e1684 7396 int cpu;
a86feae6 7397
63c5484e
TH
7398 init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
7399 init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
7400 init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
025e1684 7401 init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
a86feae6 7402
2930155b 7403 mutex_lock(&wq_pool_mutex);
a86feae6 7404
2930155b
TH
7405 /*
7406 * Workqueues allocated earlier would have all CPUs sharing the default
7407 * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
7408 * combinations to apply per-pod sharing.
7409 */
7410 list_for_each_entry(wq, &workqueues, list) {
5797b1c1 7411 for_each_online_cpu(cpu)
2930155b 7412 wq_update_pod(wq, cpu, cpu, true);
5797b1c1
TH
7413 if (wq->flags & WQ_UNBOUND) {
7414 mutex_lock(&wq->mutex);
7415 wq_update_node_max_active(wq, -1);
7416 mutex_unlock(&wq->mutex);
2930155b
TH
7417 }
7418 }
7419
7420 mutex_unlock(&wq_pool_mutex);
a86feae6
TH
7421}
7422
20bdedaf
TH
7423void __warn_flushing_systemwide_wq(void)
7424{
7425 pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
7426 dump_stack();
7427}
c4f135d6 7428EXPORT_SYMBOL(__warn_flushing_systemwide_wq);
ace3c549 7429
7430static int __init workqueue_unbound_cpus_setup(char *str)
7431{
7432 if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) {
7433 cpumask_clear(&wq_cmdline_cpumask);
7434 pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
7435 }
7436
7437 return 1;
7438}
7439__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);