git.kernel.dk Git - linux-block.git/blob

1 // SPDX-License-Identifier: GPL-2.0-only

2 /*

3 * kernel/workqueue.c - generic async execution with shared worker pool

4 *

6 *

7 * Derived from the taskqueue/keventd code by:

8 * David Woodhouse <dwmw2@infradead.org>

9 * Andrew Morton

10 * Kai Petzke <wpp@marie.physik.tu-berlin.de>

11 * Theodore Ts'o <tytso@mit.edu>

12 *

13 * Made to use alloc_percpu by Christoph Lameter.

14 *

17 *

18 * This is the generic async execution mechanism. Work items are

19 * executed in process context. The worker pool is shared and

20 * automatically managed. There are two worker pools for each CPU (one for

21 * normal work items and the other for high priority ones) and some extra

22 * pools for workqueues which are not bound to any specific CPU - the

23 * number of these backing pools is dynamic.

24 *

25 * Please read Documentation/core-api/workqueue.rst for details.

26 */

28 #include <linux/export.h>

29 #include <linux/kernel.h>

30 #include <linux/sched.h>

31 #include <linux/init.h>

32 #include <linux/signal.h>

33 #include <linux/completion.h>

34 #include <linux/workqueue.h>

35 #include <linux/slab.h>

36 #include <linux/cpu.h>

37 #include <linux/notifier.h>

38 #include <linux/kthread.h>

39 #include <linux/hardirq.h>

40 #include <linux/mempolicy.h>

41 #include <linux/freezer.h>

42 #include <linux/debug_locks.h>

43 #include <linux/lockdep.h>

44 #include <linux/idr.h>

45 #include <linux/jhash.h>

46 #include <linux/hashtable.h>

47 #include <linux/rculist.h>

48 #include <linux/nodemask.h>

49 #include <linux/moduleparam.h>

50 #include <linux/uaccess.h>

51 #include <linux/sched/isolation.h>

52 #include <linux/sched/debug.h>

53 #include <linux/nmi.h>

54 #include <linux/kvm_para.h>

55 #include <linux/delay.h>

57 #include "workqueue_internal.h"

/*
 * worker_pool flags
 *
 * A bound pool is always either associated with or disassociated from
 * its CPU.
 *
 * Associated (%POOL_DISASSOCIATED clear): all workers are bound to the
 * CPU, none has %WORKER_UNBOUND set and concurrency management is in
 * effect.
 *
 * Disassociated (%POOL_DISASSOCIATED set): the cpu may be offline, all
 * workers have %WORKER_UNBOUND set with concurrency management disabled,
 * and they may be executing on any CPU.  The pool behaves as an unbound
 * one.
 *
 * %POOL_DISASSOCIATED should be flipped only while holding
 * wq_pool_attach_mutex so that the binding state cannot change while
 * worker_attach_to_pool() is in progress.
 */
enum worker_pool_flags {
	POOL_MANAGER_ACTIVE	= 1 << 0,	/* being managed */
	POOL_DISASSOCIATED	= 1 << 2,	/* cpu can't serve workers */
};

/* worker flags */
enum worker_flags {
	WORKER_DIE		= 1 << 1,	/* die die die */
	WORKER_IDLE		= 1 << 2,	/* is idle */
	WORKER_PREP		= 1 << 3,	/* preparing to run works */
	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */
	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */
	WORKER_REBOUND		= 1 << 8,	/* worker was rebound */

	/* mask combining PREP, CPU_INTENSIVE, UNBOUND and REBOUND */
	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_CPU_INTENSIVE |
				  WORKER_UNBOUND | WORKER_REBOUND,
};

93 enum wq_internal_consts {

94 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */

96 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */

97 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */

99 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */

100 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */

101

102 MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,

103 /* call for help after 10ms

104 (min two ticks) */

105 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */

106 CREATE_COOLDOWN = HZ, /* time to breath after fail */

107

108 /*

109 * Rescue workers are used only on emergencies and shared by

110 * all cpus. Give MIN_NICE.

111 */

112 RESCUER_NICE_LEVEL = MIN_NICE,

113 HIGHPRI_NICE_LEVEL = MIN_NICE,

114

115 WQ_NAME_LEN = 32,

116 };

117

118 /*

119 * Structure fields follow one of the following exclusion rules.

120 *

121 * I: Modifiable by initialization/destruction paths and read-only for

122 * everyone else.

123 *

124 * P: Preemption protected. Disabling preemption is enough and should

125 * only be modified and accessed from the local cpu.

126 *

127 * L: pool->lock protected. Access with pool->lock held.

128 *

129 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for

130 * reads.

131 *

132 * K: Only modified by worker while holding pool->lock. Can be safely read by

133 * self, while holding pool->lock or from IRQ context if %current is the

134 * kworker.

135 *

136 * S: Only modified by worker self.

137 *

138 * A: wq_pool_attach_mutex protected.

139 *

140 * PL: wq_pool_mutex protected.

141 *

142 * PR: wq_pool_mutex protected for writes. RCU protected for reads.

143 *

144 * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.

145 *

146 * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or

147 * RCU for reads.

148 *

149 * WQ: wq->mutex protected.

150 *

151 * WR: wq->mutex protected for writes. RCU protected for reads.

152 *

153 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read

154 * with READ_ONCE() without locking.

155 *

156 * MD: wq_mayday_lock protected.

157 *

158 * WD: Used internally by the watchdog.

159 */

160

161 /* struct worker is defined in workqueue_internal.h */

162

163 struct worker_pool {

164 raw_spinlock_t lock; /* the pool lock */

165 int cpu; /* I: the associated cpu */

166 int node; /* I: the associated node ID */

167 int id; /* I: pool ID */

168 unsigned int flags; /* L: flags */

169

170 unsigned long watchdog_ts; /* L: watchdog timestamp */

171 bool cpu_stall; /* WD: stalled cpu bound pool */

172

173 /*

174 * The counter is incremented in a process context on the associated CPU

175 * w/ preemption disabled, and decremented or reset in the same context

176 * but w/ pool->lock held. The readers grab pool->lock and are

177 * guaranteed to see if the counter reached zero.

178 */

179 int nr_running;

180

181 struct list_head worklist; /* L: list of pending works */

182

183 int nr_workers; /* L: total number of workers */

184 int nr_idle; /* L: currently idle workers */

185

186 struct list_head idle_list; /* L: list of idle workers */

187 struct timer_list idle_timer; /* L: worker idle timeout */

188 struct work_struct idle_cull_work; /* L: worker idle cleanup */

189

190 struct timer_list mayday_timer; /* L: SOS timer for workers */

191

192 /* a workers is either on busy_hash or idle_list, or the manager */

193 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);

194 /* L: hash of busy workers */

195

196 struct worker *manager; /* L: purely informational */

197 struct list_head workers; /* A: attached workers */

198 struct list_head dying_workers; /* A: workers about to die */

199 struct completion *detach_completion; /* all workers detached */

200

201 struct ida worker_ida; /* worker IDs for task name */

202

203 struct workqueue_attrs *attrs; /* I: worker attributes */

204 struct hlist_node hash_node; /* PL: unbound_pool_hash node */

205 int refcnt; /* PL: refcnt for unbound pools */

206

207 /*

208 * Destruction of pool is RCU protected to allow dereferences

209 * from get_work_pool().

210 */

211 struct rcu_head rcu;

212 };

213

/*
 * Statistics kept per pool_workqueue (indices into pool_workqueue->stats).
 * They can be monitored using tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_REPATRIATED,	/* unbound workers brought back into scope */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,		/* number of counters; keep last */
};

230

231 /*

232 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS

233 * of work_struct->data are used for flags and the remaining high bits

234 * point to the pwq; thus, pwqs need to be aligned at two's power of the

235 * number of flag bits.

236 */

237 struct pool_workqueue {

238 struct worker_pool *pool; /* I: the associated pool */

239 struct workqueue_struct *wq; /* I: the owning workqueue */

240 int work_color; /* L: current color */

241 int flush_color; /* L: flushing color */

242 int refcnt; /* L: reference count */

243 int nr_in_flight[WORK_NR_COLORS];

244 /* L: nr of in_flight works */

245

246 /*

247 * nr_active management and WORK_STRUCT_INACTIVE:

248 *

249 * When pwq->nr_active >= max_active, new work item is queued to

250 * pwq->inactive_works instead of pool->worklist and marked with

251 * WORK_STRUCT_INACTIVE.

252 *

253 * All work items marked with WORK_STRUCT_INACTIVE do not participate in

254 * nr_active and all work items in pwq->inactive_works are marked with

255 * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are

256 * in pwq->inactive_works. Some of them are ready to run in

257 * pool->worklist or worker->scheduled. Those work itmes are only struct

258 * wq_barrier which is used for flush_work() and should not participate

259 * in nr_active. For non-barrier work item, it is marked with

260 * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.

261 */

262 int nr_active; /* L: nr of active works */

263 struct list_head inactive_works; /* L: inactive works */

264 struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */

265 struct list_head pwqs_node; /* WR: node on wq->pwqs */

266 struct list_head mayday_node; /* MD: node on wq->maydays */

267

268 u64 stats[PWQ_NR_STATS];

269

270 /*

271 * Release of unbound pwq is punted to a kthread_worker. See put_pwq()

272 * and pwq_release_workfn() for details. pool_workqueue itself is also

273 * RCU protected so that the first pwq can be determined without

274 * grabbing wq->mutex.

275 */

276 struct kthread_work release_work;

277 struct rcu_head rcu;

278 } __aligned(1 << WORK_STRUCT_FLAG_BITS);

279

280 /*

281 * Structure used to wait for workqueue flush.

282 */

283 struct wq_flusher {

284 struct list_head list; /* WQ: list of flushers */

285 int flush_color; /* WQ: flush color waiting for */

286 struct completion done; /* flush completion */

287 };

288

289 struct wq_device;

290

291 /*

292 * Unlike in a per-cpu workqueue where max_active limits its concurrency level

293 * on each CPU, in an unbound workqueue, max_active applies to the whole system.

294 * As sharing a single nr_active across multiple sockets can be very expensive,

295 * the counting and enforcement is per NUMA node.

296 *

297 * The following struct is used to enforce per-node max_active. When a pwq wants

298 * to start executing a work item, it should increment ->nr using

299 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over

300 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish

301 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in

302 * round-robin order.

303 */

304 struct wq_node_nr_active {

305 int max; /* per-node max_active */

306 atomic_t nr; /* per-node nr_active */

307 raw_spinlock_t lock; /* nests inside pool locks */

308 struct list_head pending_pwqs; /* LN: pwqs with inactive works */

309 };

310

311 /*

312 * The externally visible workqueue. It relays the issued work items to

313 * the appropriate worker_pool through its pool_workqueues.

314 */

315 struct workqueue_struct {

316 struct list_head pwqs; /* WR: all pwqs of this wq */

317 struct list_head list; /* PR: list of all workqueues */

318

319 struct mutex mutex; /* protects this wq */

320 int work_color; /* WQ: current work color */

321 int flush_color; /* WQ: current flush color */

322 atomic_t nr_pwqs_to_flush; /* flush in progress */

323 struct wq_flusher *first_flusher; /* WQ: first flusher */

324 struct list_head flusher_queue; /* WQ: flush waiters */

325 struct list_head flusher_overflow; /* WQ: flush overflow list */

326

327 struct list_head maydays; /* MD: pwqs requesting rescue */

328 struct worker *rescuer; /* MD: rescue worker */

329

330 int nr_drainers; /* WQ: drain in progress */

331

332 /* See alloc_workqueue() function comment for info on min/max_active */

333 int max_active; /* WO: max active works */

334 int min_active; /* WO: min active works */

335 int saved_max_active; /* WQ: saved max_active */

336 int saved_min_active; /* WQ: saved min_active */

337

338 struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */

339 struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */

340

341 #ifdef CONFIG_SYSFS

342 struct wq_device *wq_dev; /* I: for sysfs interface */

343 #endif

344 #ifdef CONFIG_LOCKDEP

345 char *lock_name;

346 struct lock_class_key key;

347 struct lockdep_map lockdep_map;

348 #endif

349 char name[WQ_NAME_LEN]; /* I: workqueue name */

350

351 /*

352 * Destruction of workqueue_struct is RCU protected to allow walking

353 * the workqueues list without grabbing wq_pool_mutex.

354 * This is used to dump all workqueues from sysrq.

355 */

356 struct rcu_head rcu;

357

358 /* hot fields used during command issue, aligned to cacheline */

359 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */

360 struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */

361 struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */

362 };

363

364 static struct kmem_cache *pwq_cache;

365

366 /*

367 * Each pod type describes how CPUs should be grouped for unbound workqueues.

368 * See the comment above workqueue_attrs->affn_scope.

369 */

370 struct wq_pod_type {

371 int nr_pods; /* number of pods */

372 cpumask_var_t *pod_cpus; /* pod -> cpus */

373 int *pod_node; /* pod -> node */

374 int *cpu_pod; /* cpu -> pod */

375 };

376

377 static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];

378 static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

379

380 static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {

381 [WQ_AFFN_DFL] = "default",

382 [WQ_AFFN_CPU] = "cpu",

383 [WQ_AFFN_SMT] = "smt",

384 [WQ_AFFN_CACHE] = "cache",

385 [WQ_AFFN_NUMA] = "numa",

386 [WQ_AFFN_SYSTEM] = "system",

387 };

388

389 /*

390 * Per-cpu work items which run for longer than the following threshold are

391 * automatically considered CPU intensive and excluded from concurrency

392 * management to prevent them from noticeably delaying other per-cpu work items.

393 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.

394 * The actual value is initialized in wq_cpu_intensive_thresh_init().

395 */

396 static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;

397 module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);

398

399 /* see the comment above the definition of WQ_POWER_EFFICIENT */

400 static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);

401 module_param_named(power_efficient, wq_power_efficient, bool, 0444);

402

403 static bool wq_online; /* can kworkers be created yet? */

404

405 /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */

406 static struct workqueue_attrs *wq_update_pod_attrs_buf;

407

408 static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */

409 static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */

410 static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */

411 /* wait for manager to go away */

412 static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

413

414 static LIST_HEAD(workqueues); /* PR: list of all workqueues */

415 static bool workqueue_freezing; /* PL: have wqs started freezing? */

416

417 /* PL&A: allowable cpus for unbound wqs and work items */

418 static cpumask_var_t wq_unbound_cpumask;

419

420 /* PL: user requested unbound cpumask via sysfs */

421 static cpumask_var_t wq_requested_unbound_cpumask;

422

423 /* PL: isolated cpumask to be excluded from unbound cpumask */

424 static cpumask_var_t wq_isolated_cpumask;

425

426 /* further constrains wq_unbound_cpumask by cmdline parameter */

427 static struct cpumask wq_cmdline_cpumask __initdata;

428

429 /* CPU where unbound work was last round robin scheduled from this CPU */

430 static DEFINE_PER_CPU(int, wq_rr_cpu_last);

431

432 /*

433 * Local execution of unbound work items is no longer guaranteed. The

434 * following always forces round-robin CPU selection on unbound work items

435 * to uncover usages which depend on it.

436 */

437 #ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU

438 static bool wq_debug_force_rr_cpu = true;

439 #else

440 static bool wq_debug_force_rr_cpu = false;

441 #endif

442 module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

443

444 /* the per-cpu worker pools */

445 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

446

447 static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */

448

449 /* PL: hash of all unbound pools keyed by pool->attrs */

450 static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

451

452 /* I: attributes used when instantiating standard unbound pools on demand */

453 static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

454

455 /* I: attributes used when instantiating ordered pools on demand */

456 static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

457

458 /*

459 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a

460 * process context while holding a pool lock. Bounce to a dedicated kthread

461 * worker to avoid A-A deadlocks.

462 */

463 static struct kthread_worker *pwq_release_worker __ro_after_init;

464

465 struct workqueue_struct *system_wq __ro_after_init;

466 EXPORT_SYMBOL(system_wq);

467 struct workqueue_struct *system_highpri_wq __ro_after_init;

468 EXPORT_SYMBOL_GPL(system_highpri_wq);

469 struct workqueue_struct *system_long_wq __ro_after_init;

470 EXPORT_SYMBOL_GPL(system_long_wq);

471 struct workqueue_struct *system_unbound_wq __ro_after_init;

472 EXPORT_SYMBOL_GPL(system_unbound_wq);

473 struct workqueue_struct *system_freezable_wq __ro_after_init;

474 EXPORT_SYMBOL_GPL(system_freezable_wq);

475 struct workqueue_struct *system_power_efficient_wq __ro_after_init;

476 EXPORT_SYMBOL_GPL(system_power_efficient_wq);

477 struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;

478 EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);

479

480 static int worker_thread(void *__worker);

481 static void workqueue_sysfs_unregister(struct workqueue_struct *wq);

482 static void show_pwq(struct pool_workqueue *pwq);

483 static void show_one_worker_pool(struct worker_pool *pool);

484

485 #define CREATE_TRACE_POINTS

486 #include <trace/events/workqueue.h>

487

/* Lockdep check: complain unless RCU read lock or wq_pool_mutex is held. */
#define assert_rcu_or_pool_mutex()					\
	RCU_LOCKDEP_WARN(!(rcu_read_lock_held() ||			\
			   lockdep_is_held(&wq_pool_mutex)),		\
			 "RCU or wq_pool_mutex should be held")

492

/*
 * Lockdep check: complain unless RCU read lock, @wq->mutex or
 * wq_pool_mutex is held.  @wq is parenthesized so that any pointer
 * expression can be passed safely.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)			\
	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
			 !lockdep_is_held(&(wq)->mutex) &&		\
			 !lockdep_is_held(&wq_pool_mutex),		\
			 "RCU, wq->mutex or wq_pool_mutex should be held")

498

/* Walk @pool over the NR_STD_WORKER_POOLS standard pools of @cpu. */
#define for_each_cpu_worker_pool(pool, cpu)				\
	for ((pool) = per_cpu(cpu_worker_pools, cpu);			\
	     (pool) < per_cpu(cpu_worker_pools, cpu) + NR_STD_WORKER_POOLS; \
	     (pool)++)

503

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * The caller must hold wq_pool_mutex or be inside an RCU read-side
 * critical section.  If the pool needs to be used beyond the locking in
 * effect, the caller is responsible for guaranteeing that the pool stays
 * online.
 *
 * The if/else wrapped around the loop body is there purely to run the
 * lockdep assertion on each iteration; its condition is always false.
 */
#define for_each_pool(pool, pi)						\
	idr_for_each_entry(&worker_pool_idr, pool, pi)			\
		if (({ assert_rcu_or_pool_mutex(); false; })) { }	\
		else

520

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * The caller must hold wq_pool_attach_mutex.
 *
 * The if/else wrapped around the loop body is there purely to run the
 * lockdep assertion on each iteration; its condition is always false.
 */
#define for_each_pool_worker(worker, pool)				\
	list_for_each_entry((worker), &(pool)->workers, node)		\
		if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
		else

535

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The lockdep condition is handed to list_for_each_entry_rcu() as its
 * optional last argument.  @wq is parenthesized wherever it is expanded
 * so that any pointer expression can be passed safely.
 */
#define for_each_pwq(pwq, wq)						\
	list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,		\
				 lockdep_is_held(&(wq)->mutex))

551

552 #ifdef CONFIG_DEBUG_OBJECTS_WORK

553

554 static const struct debug_obj_descr work_debug_descr;

555

556 static void *work_debug_hint(void *addr)

557 {

558 return ((struct work_struct *) addr)->func;

559 }

560

561 static bool work_is_static_object(void *addr)

562 {

563 struct work_struct *work = addr;

564

565 return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));

566 }

567

568 /*

569 * fixup_init is called when:

570 * - an active object is initialized

571 */

572 static bool work_fixup_init(void *addr, enum debug_obj_state state)

573 {

574 struct work_struct *work = addr;

575

576 switch (state) {

577 case ODEBUG_STATE_ACTIVE:

578 cancel_work_sync(work);

579 debug_object_init(work, &work_debug_descr);

580 return true;

581 default:

582 return false;

583 }

584 }

585

586 /*

587 * fixup_free is called when:

588 * - an active object is freed

589 */

590 static bool work_fixup_free(void *addr, enum debug_obj_state state)

591 {

592 struct work_struct *work = addr;

593

594 switch (state) {

595 case ODEBUG_STATE_ACTIVE:

596 cancel_work_sync(work);

597 debug_object_free(work, &work_debug_descr);

598 return true;

599 default:

600 return false;

601 }

602 }

603

604 static const struct debug_obj_descr work_debug_descr = {

605 .name = "work_struct",

606 .debug_hint = work_debug_hint,

607 .is_static_object = work_is_static_object,

608 .fixup_init = work_fixup_init,

609 .fixup_free = work_fixup_free,

610 };

611

612 static inline void debug_work_activate(struct work_struct *work)

613 {

614 debug_object_activate(work, &work_debug_descr);

615 }

616

617 static inline void debug_work_deactivate(struct work_struct *work)

618 {

619 debug_object_deactivate(work, &work_debug_descr);

620 }

621

622 void __init_work(struct work_struct *work, int onstack)

623 {

624 if (onstack)

625 debug_object_init_on_stack(work, &work_debug_descr);

626 else

627 debug_object_init(work, &work_debug_descr);

628 }

629 EXPORT_SYMBOL_GPL(__init_work);

630

631 void destroy_work_on_stack(struct work_struct *work)

632 {

633 debug_object_free(work, &work_debug_descr);

634 }

635 EXPORT_SYMBOL_GPL(destroy_work_on_stack);

636

637 void destroy_delayed_work_on_stack(struct delayed_work *work)

638 {

639 destroy_timer_on_stack(&work->timer);

640 debug_object_free(&work->work, &work_debug_descr);

641 }

642 EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

643

644 #else

/* No-op when CONFIG_DEBUG_OBJECTS_WORK is not set. */
static inline void debug_work_activate(struct work_struct *work)
{
}

/* No-op when CONFIG_DEBUG_OBJECTS_WORK is not set. */
static inline void debug_work_deactivate(struct work_struct *work)
{
}

647 #endif

648

649 /**

650 * worker_pool_assign_id - allocate ID and assign it to @pool

651 * @pool: the pool pointer of interest

652 *

653 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned

654 * successfully, -errno on failure.

655 */

656 static int worker_pool_assign_id(struct worker_pool *pool)

657 {

658 int ret;

659

660 lockdep_assert_held(&wq_pool_mutex);

661

662 ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,

663 GFP_KERNEL);

664 if (ret >= 0) {

665 pool->id = ret;

666 return 0;

667 }

668 return ret;

669 }

670

671 static struct pool_workqueue __rcu **

672 unbound_pwq_slot(struct workqueue_struct *wq, int cpu)

673 {

674 if (cpu >= 0)

675 return per_cpu_ptr(wq->cpu_pwq, cpu);

676 else

677 return &wq->dfl_pwq;

678 }

679

680 /* @cpu < 0 for dfl_pwq */

681 static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)

682 {

683 return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),

684 lockdep_is_held(&wq_pool_mutex) ||

685 lockdep_is_held(&wq->mutex));

686 }

687

688 /**

689 * unbound_effective_cpumask - effective cpumask of an unbound workqueue

690 * @wq: workqueue of interest

691 *

692 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which

693 * is masked with wq_unbound_cpumask to determine the effective cpumask. The

694 * default pwq is always mapped to the pool with the current effective cpumask.

695 */

696 static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)

697 {

698 return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;

699 }

700

701 static unsigned int work_color_to_flags(int color)

702 {

703 return color << WORK_STRUCT_COLOR_SHIFT;

704 }

705

706 static int get_work_color(unsigned long work_data)

707 {

708 return (work_data >> WORK_STRUCT_COLOR_SHIFT) &

709 ((1 << WORK_STRUCT_COLOR_BITS) - 1);

710 }

711

712 static int work_next_color(int color)

713 {

714 return (color + 1) % WORK_NR_COLORS;

715 }

716

717 /*

718 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data

719 * contain the pointer to the queued pwq. Once execution starts, the flag

720 * is cleared and the high bits contain OFFQ flags and pool ID.

721 *

722 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()

723 * and clear_work_data() can be used to set the pwq, pool or clear

724 * work->data. These functions should only be called while the work is

725 * owned - ie. while the PENDING bit is set.

726 *

727 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq

728 * corresponding to a work. Pool is available once the work has been

729 * queued anywhere after initialization until it is sync canceled. pwq is

730 * available only while the work item is queued.

731 *

732 * %WORK_OFFQ_CANCELING is used to mark a work item which is being

733 * canceled. While being canceled, a work item may have its PENDING set

734 * but stay off timer and worklist for arbitrarily long and nobody should

735 * try to steal the PENDING bit.

736 */

737 static inline void set_work_data(struct work_struct *work, unsigned long data,

738 unsigned long flags)

739 {

740 WARN_ON_ONCE(!work_pending(work));

741 atomic_long_set(&work->data, data | flags | work_static(work));

742 }

743

744 static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,

745 unsigned long extra_flags)

746 {

747 set_work_data(work, (unsigned long)pwq,

748 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);

749 }

750

751 static void set_work_pool_and_keep_pending(struct work_struct *work,

752 int pool_id)

753 {

754 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,

755 WORK_STRUCT_PENDING);

756 }

757

758 static void set_work_pool_and_clear_pending(struct work_struct *work,

759 int pool_id)

760 {

761 /*

762 * The following wmb is paired with the implied mb in

763 * test_and_set_bit(PENDING) and ensures all updates to @work made

764 * here are visible to and precede any updates by the next PENDING

765 * owner.

766 */

767 smp_wmb();

768 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);

769 /*

770 * The following mb guarantees that previous clear of a PENDING bit

771 * will not be reordered with any speculative LOADS or STORES from

772 * work->current_func, which is executed afterwards. This possible

773 * reordering can lead to a missed execution on attempt to queue

774 * the same @work. E.g. consider this case:

775 *

776 * CPU#0 CPU#1

777 * ---------------------------- --------------------------------

778 *

779 * 1 STORE event_indicated

780 * 2 queue_work_on() {

781 * 3 test_and_set_bit(PENDING)

782 * 4 } set_..._and_clear_pending() {

783 * 5 set_work_data() # clear bit

784 * 6 smp_mb()

785 * 7 work->current_func() {

786 * 8 LOAD event_indicated

787 * }

788 *

789 * Without an explicit full barrier speculative LOAD on line 8 can

790 * be executed before CPU#0 does STORE on line 1. If that happens,

791 * CPU#0 observes the PENDING bit is still set and new execution of

792 * a @work is not queued in a hope, that CPU#1 will eventually

793 * finish the queued @work. Meanwhile CPU#1 does not see

794 * event_indicated is set, because speculative LOAD was executed

795 * before actual STORE.

796 */

797 smp_mb();

798 }

799

800 static void clear_work_data(struct work_struct *work)

801 {

802 smp_wmb(); /* see set_work_pool_and_clear_pending() */

803 set_work_data(work, WORK_STRUCT_NO_POOL, 0);

804 }

805

806 static inline struct pool_workqueue *work_struct_pwq(unsigned long data)

807 {

808 return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK);

809 }

810

811 static struct pool_workqueue *get_work_pwq(struct work_struct *work)

812 {

813 unsigned long data = atomic_long_read(&work->data);

814

815 if (data & WORK_STRUCT_PWQ)

816 return work_struct_pwq(data);

817 else

818 return NULL;

819 }

820

821 /**

822 * get_work_pool - return the worker_pool a given work was associated with

823 * @work: the work item of interest

824 *

825 * Pools are created and destroyed under wq_pool_mutex, and allows read

826 * access under RCU read lock. As such, this function should be

827 * called under wq_pool_mutex or inside of a rcu_read_lock() region.

828 *

829 * All fields of the returned pool are accessible as long as the above

830 * mentioned locking is in effect. If the returned pool needs to be used

831 * beyond the critical section, the caller is responsible for ensuring the

832 * returned pool is and stays online.

833 *

834 * Return: The worker_pool @work was last associated with. %NULL if none.

835 */

836 static struct worker_pool *get_work_pool(struct work_struct *work)

837 {

838 unsigned long data = atomic_long_read(&work->data);

839 int pool_id;

840

841 assert_rcu_or_pool_mutex();

842

843 if (data & WORK_STRUCT_PWQ)

844 return work_struct_pwq(data)->pool;

845

846 pool_id = data >> WORK_OFFQ_POOL_SHIFT;

847 if (pool_id == WORK_OFFQ_POOL_NONE)

848 return NULL;

849

850 return idr_find(&worker_pool_idr, pool_id);

851 }

852

853 /**

854 * get_work_pool_id - return the worker pool ID a given work is associated with

855 * @work: the work item of interest

856 *

857 * Return: The worker_pool ID @work was last associated with.

858 * %WORK_OFFQ_POOL_NONE if none.

859 */

860 static int get_work_pool_id(struct work_struct *work)

861 {

862 unsigned long data = atomic_long_read(&work->data);

863

864 if (data & WORK_STRUCT_PWQ)

865 return work_struct_pwq(data)->pool->id;

866

867 return data >> WORK_OFFQ_POOL_SHIFT;

868 }

869

870 static void mark_work_canceling(struct work_struct *work)

871 {

872 unsigned long pool_id = get_work_pool_id(work);

873

874 pool_id <<= WORK_OFFQ_POOL_SHIFT;

875 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);

876 }

877

878 static bool work_is_canceling(struct work_struct *work)

879 {

880 unsigned long data = atomic_long_read(&work->data);

881

882 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);

883 }

884

885 /*

886 * Policy functions. These define the policies on how the global worker

887 * pools are managed. Unless noted otherwise, these functions assume that

888 * they're being called with pool->lock held.

889 */

890

891 /*

892 * Need to wake up a worker? Called from anything but currently

893 * running workers.

894 *

895 * Note that, because unbound workers never contribute to nr_running, this

896 * function will always return %true for unbound pools as long as the

897 * worklist isn't empty.

898 */

899 static bool need_more_worker(struct worker_pool *pool)

900 {

901 return !list_empty(&pool->worklist) && !pool->nr_running;

902 }

903

904 /* Can I start working? Called from busy but !running workers. */

905 static bool may_start_working(struct worker_pool *pool)

906 {

907 return pool->nr_idle;

908 }

909

910 /* Do I need to keep working? Called from currently running workers. */

911 static bool keep_working(struct worker_pool *pool)

912 {

913 return !list_empty(&pool->worklist) && (pool->nr_running <= 1);

914 }

915

916 /* Do we need a new worker? Called from manager. */

917 static bool need_to_create_worker(struct worker_pool *pool)

918 {

919 return need_more_worker(pool) && !may_start_working(pool);

920 }

921

922 /* Do we have too many workers and should some go away? */

923 static bool too_many_workers(struct worker_pool *pool)

924 {

925 bool managing = pool->flags & POOL_MANAGER_ACTIVE;

926 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */

927 int nr_busy = pool->nr_workers - nr_idle;

928

929 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;

930 }

931

932 /**

933 * worker_set_flags - set worker flags and adjust nr_running accordingly

934 * @worker: self

935 * @flags: flags to set

936 *

937 * Set @flags in @worker->flags and adjust nr_running accordingly.

938 */

939 static inline void worker_set_flags(struct worker *worker, unsigned int flags)

940 {

941 struct worker_pool *pool = worker->pool;

942

943 lockdep_assert_held(&pool->lock);

944

945 /* If transitioning into NOT_RUNNING, adjust nr_running. */

946 if ((flags & WORKER_NOT_RUNNING) &&

947 !(worker->flags & WORKER_NOT_RUNNING)) {

948 pool->nr_running--;

949 }

950

951 worker->flags |= flags;

952 }

953

954 /**

955 * worker_clr_flags - clear worker flags and adjust nr_running accordingly

956 * @worker: self

957 * @flags: flags to clear

958 *

959 * Clear @flags in @worker->flags and adjust nr_running accordingly.

960 */

961 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)

962 {

963 struct worker_pool *pool = worker->pool;

964 unsigned int oflags = worker->flags;

965

966 lockdep_assert_held(&pool->lock);

967

968 worker->flags &= ~flags;

969

970 /*

971 * If transitioning out of NOT_RUNNING, increment nr_running. Note

972 * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask

973 * of multiple flags, not a single flag.

974 */

975 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))

976 if (!(worker->flags & WORKER_NOT_RUNNING))

977 pool->nr_running++;

978 }

979

980 /* Return the first idle worker. Called with pool->lock held. */

981 static struct worker *first_idle_worker(struct worker_pool *pool)

982 {

983 if (unlikely(list_empty(&pool->idle_list)))

984 return NULL;

985

986 return list_first_entry(&pool->idle_list, struct worker, entry);

987 }

988

989 /**

990 * worker_enter_idle - enter idle state

991 * @worker: worker which is entering idle state

992 *

993 * @worker is entering idle state. Update stats and idle timer if

994 * necessary.

995 *

996 * LOCKING:

997 * raw_spin_lock_irq(pool->lock).

998 */

999 static void worker_enter_idle(struct worker *worker)

1000 {

1001 struct worker_pool *pool = worker->pool;

1002

1003 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||

1004 WARN_ON_ONCE(!list_empty(&worker->entry) &&

1005 (worker->hentry.next || worker->hentry.pprev)))

1006 return;

1007

1008 /* can't use worker_set_flags(), also called from create_worker() */

1009 worker->flags |= WORKER_IDLE;

1010 pool->nr_idle++;

1011 worker->last_active = jiffies;

1012

1013 /* idle_list is LIFO */

1014 list_add(&worker->entry, &pool->idle_list);

1015

1016 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))

1017 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);

1018

1019 /* Sanity check nr_running. */

1020 WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);

1021 }

1022

1023 /**

1024 * worker_leave_idle - leave idle state

1025 * @worker: worker which is leaving idle state

1026 *

1027 * @worker is leaving idle state. Update stats.

1028 *

1029 * LOCKING:

1030 * raw_spin_lock_irq(pool->lock).

1031 */

1032 static void worker_leave_idle(struct worker *worker)

1033 {

1034 struct worker_pool *pool = worker->pool;

1035

1036 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))

1037 return;

1038 worker_clr_flags(worker, WORKER_IDLE);

1039 pool->nr_idle--;

1040 list_del_init(&worker->entry);

1041 }

1042

1043 /**

1044 * find_worker_executing_work - find worker which is executing a work

1045 * @pool: pool of interest

1046 * @work: work to find worker for

1047 *

1048 * Find a worker which is executing @work on @pool by searching

1049 * @pool->busy_hash which is keyed by the address of @work. For a worker

1050 * to match, its current execution should match the address of @work and

1051 * its work function. This is to avoid unwanted dependency between

1052 * unrelated work executions through a work item being recycled while still

1053 * being executed.

1054 *

1055 * This is a bit tricky. A work item may be freed once its execution

1056 * starts and nothing prevents the freed area from being recycled for

1057 * another work item. If the same work item address ends up being reused

1058 * before the original execution finishes, workqueue will identify the

1059 * recycled work item as currently executing and make it wait until the

1060 * current execution finishes, introducing an unwanted dependency.

1061 *

1062 * This function checks the work item address and work function to avoid

1063 * false positives. Note that this isn't complete as one may construct a

1064 * work function which can introduce dependency onto itself through a

1065 * recycled work item. Well, if somebody wants to shoot oneself in the

1066 * foot that badly, there's only so much we can do, and if such deadlock

1067 * actually occurs, it should be easy to locate the culprit work function.

1068 *

1069 * CONTEXT:

1070 * raw_spin_lock_irq(pool->lock).

1071 *

1072 * Return:

1073 * Pointer to worker which is executing @work if found, %NULL

1074 * otherwise.

1075 */

1076 static struct worker *find_worker_executing_work(struct worker_pool *pool,

1077 struct work_struct *work)

1078 {

1079 struct worker *worker;

1080

1081 hash_for_each_possible(pool->busy_hash, worker, hentry,

1082 (unsigned long)work)

1083 if (worker->current_work == work &&

1084 worker->current_func == work->func)

1085 return worker;

1086

1087 return NULL;

1088 }

1089

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking, may be %NULL
 *
 * Move the series of linked works starting at @work to the tail of @head.
 * The series consists of @work and every following work item whose
 * predecessor has WORK_STRUCT_LINKED set; the first item without the flag is
 * the last one moved. See assign_work() for details on @nextp.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
			      struct work_struct **nextp)
{
	struct work_struct *n;

	/*
	 * Linked worklist will always end before the end of the list,
	 * use NULL for list head. The loop is left through the break below,
	 * with @n holding the entry that followed the last moved work.
	 */
	list_for_each_entry_safe_from(work, n, NULL, entry) {
		list_move_tail(&work->entry, head);
		/* a work without LINKED terminates the series */
		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
			break;
	}

	/*
	 * If we're already inside safe list traversal and have moved
	 * multiple works to the scheduled queue, the next position
	 * needs to be updated.
	 */
	if (nextp)
		*nextp = n;
}

1127

1128 /**

1129 * assign_work - assign a work item and its linked work items to a worker

1130 * @work: work to assign

1131 * @worker: worker to assign to

1132 * @nextp: out parameter for nested worklist walking

1133 *

1134 * Assign @work and its linked work items to @worker. If @work is already being

1135 * executed by another worker in the same pool, it'll be punted there.

1136 *

1137 * If @nextp is not NULL, it's updated to point to the next work of the last

1138 * scheduled work. This allows assign_work() to be nested inside

1139 * list_for_each_entry_safe().

1140 *

1141 * Returns %true if @work was successfully assigned to @worker. %false if @work

1142 * was punted to another worker already executing it.

1143 */

1144 static bool assign_work(struct work_struct *work, struct worker *worker,

1145 struct work_struct **nextp)

1146 {

1147 struct worker_pool *pool = worker->pool;

1148 struct worker *collision;

1149

1150 lockdep_assert_held(&pool->lock);

1151

1152 /*

1153 * A single work shouldn't be executed concurrently by multiple workers.

1154 * __queue_work() ensures that @work doesn't jump to a different pool

1155 * while still running in the previous pool. Here, we should ensure that

1156 * @work is not executed concurrently by multiple workers from the same

1157 * pool. Check whether anyone is already processing the work. If so,

1158 * defer the work to the currently executing one.

1159 */

1160 collision = find_worker_executing_work(pool, work);

1161 if (unlikely(collision)) {

1162 move_linked_works(work, &collision->scheduled, nextp);

1163 return false;

1164 }

1165

1166 move_linked_works(work, &worker->scheduled, nextp);

1167 return true;

1168 }

1169

1170 /**

1171 * kick_pool - wake up an idle worker if necessary

1172 * @pool: pool to kick

1173 *

1174 * @pool may have pending work items. Wake up worker if necessary. Returns

1175 * whether a worker was woken up.

1176 */

1177 static bool kick_pool(struct worker_pool *pool)

1178 {

1179 struct worker *worker = first_idle_worker(pool);

1180 struct task_struct *p;

1181

1182 lockdep_assert_held(&pool->lock);

1183

1184 if (!need_more_worker(pool) || !worker)

1185 return false;

1186

1187 p = worker->task;

1188

1189 #ifdef CONFIG_SMP

1190 /*

1191 * Idle @worker is about to execute @work and waking up provides an

1192 * opportunity to migrate @worker at a lower cost by setting the task's

1193 * wake_cpu field. Let's see if we want to move @worker to improve

1194 * execution locality.

1195 *

1196 * We're waking the worker that went idle the latest and there's some

1197 * chance that @worker is marked idle but hasn't gone off CPU yet. If

1198 * so, setting the wake_cpu won't do anything. As this is a best-effort

1199 * optimization and the race window is narrow, let's leave as-is for

1200 * now. If this becomes pronounced, we can skip over workers which are

1201 * still on cpu when picking an idle worker.

1202 *

1203 * If @pool has non-strict affinity, @worker might have ended up outside

1204 * its affinity scope. Repatriate.

1205 */

1206 if (!pool->attrs->affn_strict &&

1207 !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {

1208 struct work_struct *work = list_first_entry(&pool->worklist,

1209 struct work_struct, entry);

1210 p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);

1211 get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;

1212 }

1213 #endif

1214 wake_up_process(p);

1215 return true;

1216 }

1217

1218 #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

1219

1220 /*

1221 * Concurrency-managed per-cpu work items that hog CPU for longer than

1222 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,

1223 * which prevents them from stalling other concurrency-managed work items. If a

1224 * work function keeps triggering this mechanism, it's likely that the work item

1225 * should be using an unbound workqueue instead.

1226 *

1227 * wq_cpu_intensive_report() tracks work functions which trigger such conditions

1228 * and report them so that they can be examined and converted to use unbound

1229 * workqueues as appropriate. To avoid flooding the console, each violating work

1230 * function is tracked and reported with exponential backoff.

1231 */

/* capacity of wci_ents[]; wci_hash is sized from it via ilog2() */
#define WCI_MAX_ENTS 128

/* bookkeeping for one work function tracked by wq_cpu_intensive_report() */
struct wci_ent {
	work_func_t func;		/* the tracked work function */
	atomic64_t cnt;			/* set to 1 on creation, bumped per report */
	struct hlist_node hash_node;	/* link in wci_hash, keyed by @func */
};

/* backing storage for entries; slots are handed out in order */
static struct wci_ent wci_ents[WCI_MAX_ENTS];
/* number of wci_ents[] slots handed out so far */
static int wci_nr_ents;
/* held by wq_cpu_intensive_report() while a new entry is being added */
static DEFINE_RAW_SPINLOCK(wci_lock);
/* work function -> wci_ent lookup table */
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

1244

1245 static struct wci_ent *wci_find_ent(work_func_t func)

1246 {

1247 struct wci_ent *ent;

1248

1249 hash_for_each_possible_rcu(wci_hash, ent, hash_node,

1250 (unsigned long)func) {

1251 if (ent->func == func)

1252 return ent;

1253 }

1254 return NULL;

1255 }

1256

1257 static void wq_cpu_intensive_report(work_func_t func)

1258 {

1259 struct wci_ent *ent;

1260

1261 restart:

1262 ent = wci_find_ent(func);

1263 if (ent) {

1264 u64 cnt;

1265

1266 /*

1267 * Start reporting from the fourth time and back off

1268 * exponentially.

1269 */

1270 cnt = atomic64_inc_return_relaxed(&ent->cnt);

1271 if (cnt >= 4 && is_power_of_2(cnt))

1272 printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",

1273 ent->func, wq_cpu_intensive_thresh_us,

1274 atomic64_read(&ent->cnt));

1275 return;

1276 }

1277

1278 /*

1279 * @func is a new violation. Allocate a new entry for it. If wcn_ents[]

1280 * is exhausted, something went really wrong and we probably made enough

1281 * noise already.

1282 */

1283 if (wci_nr_ents >= WCI_MAX_ENTS)

1284 return;

1285

1286 raw_spin_lock(&wci_lock);

1287

1288 if (wci_nr_ents >= WCI_MAX_ENTS) {

1289 raw_spin_unlock(&wci_lock);

1290 return;

1291 }

1292

1293 if (wci_find_ent(func)) {

1294 raw_spin_unlock(&wci_lock);

1295 goto restart;

1296 }

1297

1298 ent = &wci_ents[wci_nr_ents++];

1299 ent->func = func;

1300 atomic64_set(&ent->cnt, 1);

1301 hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

1302

1303 raw_spin_unlock(&wci_lock);

1304 }

1305

1306 #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

1307 static void wq_cpu_intensive_report(work_func_t func) {}

1308 #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

1309

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule(). It undoes
 * the accounting done by wq_worker_sleeping(): unless the worker is flagged
 * NOT_RUNNING, the pool's nr_running is incremented, and the worker's
 * sleeping marker is cleared.
 */
void wq_worker_running(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	/* nothing to undo unless wq_worker_sleeping() marked us sleeping */
	if (!READ_ONCE(worker->sleeping))
		return;

	/*
	 * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
	 * and the nr_running increment below, we may ruin the nr_running reset
	 * and leave with an unexpected pool->nr_running == 1 on the newly unbound
	 * pool. Protect against such race.
	 */
	preempt_disable();
	if (!(worker->flags & WORKER_NOT_RUNNING))
		worker->pool->nr_running++;
	preempt_enable();

	/*
	 * CPU intensive auto-detection cares about how long a work item hogged
	 * CPU without sleeping. Reset the starting timestamp on wakeup.
	 */
	worker->current_at = worker->task->se.sum_exec_runtime;

	/* cleared last, after the accounting above has been redone */
	WRITE_ONCE(worker->sleeping, 0);
}

1342

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep. Unless the worker is flagged NOT_RUNNING or is already
 * marked sleeping, it is marked sleeping, the pool's nr_running is
 * decremented and the pool is kicked so that another worker can take over.
 */
void wq_worker_sleeping(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct worker_pool *pool;

	/*
	 * Rescuers, which may not have all the fields set up like normal
	 * workers, also reach here, let's not access anything before
	 * checking NOT_RUNNING.
	 */
	if (worker->flags & WORKER_NOT_RUNNING)
		return;

	pool = worker->pool;

	/* Return if preempted before wq_worker_running() was reached */
	if (READ_ONCE(worker->sleeping))
		return;

	/* mark first, then take the pool lock for the accounting */
	WRITE_ONCE(worker->sleeping, 1);
	raw_spin_lock_irq(&pool->lock);

	/*
	 * Recheck in case unbind_workers() preempted us. We don't
	 * want to decrement nr_running after the worker is unbound
	 * and nr_running has been reset.
	 */
	if (worker->flags & WORKER_NOT_RUNNING) {
		raw_spin_unlock_irq(&pool->lock);
		return;
	}

	pool->nr_running--;
	/* count a concurrency-management wakeup if a worker was woken */
	if (kick_pool(pool))
		worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;

	raw_spin_unlock_irq(&pool->lock);
}

1388

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from scheduler_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 *
 * Charges one tick of CPU time to the pwq of the work item being executed
 * and, if the work item has been running for longer than
 * wq_cpu_intensive_thresh_us without sleeping, marks the worker
 * CPU_INTENSIVE and kicks the pool.
 */
void wq_worker_tick(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct pool_workqueue *pwq = worker->current_pwq;
	struct worker_pool *pool = worker->pool;

	/* no current pwq, nothing to charge the tick to */
	if (!pwq)
		return;

	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

	/* a zero threshold skips the CPU-intensive check below */
	if (!wq_cpu_intensive_thresh_us)
		return;

	/*
	 * If the current worker is concurrency managed and hogged the CPU for
	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
	 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
	 *
	 * Set @worker->sleeping means that @worker is in the process of
	 * switching out voluntarily and won't be contributing to
	 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
	 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
	 * double decrements. The task is releasing the CPU anyway. Let's skip.
	 * We probably want to make this prettier in the future.
	 */
	if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
	    worker->task->se.sum_exec_runtime - worker->current_at <
	    wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
		return;

	raw_spin_lock(&pool->lock);

	/* CPU_INTENSIVE takes @worker out of nr_running, see worker_set_flags() */
	worker_set_flags(worker, WORKER_CPU_INTENSIVE);
	wq_cpu_intensive_report(worker->current_func);
	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

	/* count a concurrency-management wakeup if a worker was woken */
	if (kick_pool(pool))
		pwq->stats[PWQ_STAT_CM_WAKEUP]++;

	raw_spin_unlock(&pool->lock);
}

1438

1439 /**

1440 * wq_worker_last_func - retrieve worker's last work function

1441 * @task: Task to retrieve last work function of.

1442 *

1443 * Determine the last function a worker executed. This is called from

1444 * the scheduler to get a worker's last known identity.

1445 *

1446 * CONTEXT:

1447 * raw_spin_lock_irq(rq->lock)

1448 *

1449 * This function is called during schedule() when a kworker is going

1450 * to sleep. It's used by psi to identify aggregation workers during

1451 * dequeuing, to allow periodic aggregation to shut-off when that

1452 * worker is the last task in the system or cgroup to go to sleep.

1453 *

1454 * As this function doesn't involve any workqueue-related locking, it

1455 * only returns stable values when called from inside the scheduler's

1456 * queuing and dequeuing paths, when @task, which must be a kworker,

1457 * is guaranteed to not be processing any works.

1458 *

1459 * Return:

1460 * The last work function %current executed as a worker, NULL if it

1461 * hasn't executed any work yet.

1462 */

1463 work_func_t wq_worker_last_func(struct task_struct *task)

1464 {

1465 struct worker *worker = kthread_data(task);

1466

1467 return worker->last_func;

1468 }

1469

1470 /**

1471 * wq_node_nr_active - Determine wq_node_nr_active to use

1472 * @wq: workqueue of interest

1473 * @node: NUMA node, can be %NUMA_NO_NODE

1474 *

1475 * Determine wq_node_nr_active to use for @wq on @node. Returns:

1476 *

1477 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.

1478 *

1479 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.

1480 *

1481 * - Otherwise, node_nr_active[@node].

1482 */

1483 static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,

1484 int node)

1485 {

1486 if (!(wq->flags & WQ_UNBOUND))

1487 return NULL;

1488

1489 if (node == NUMA_NO_NODE)

1490 node = nr_node_ids;

1491

1492 return wq->node_nr_active[node];

1493 }

1494

1495 /**

1496 * wq_update_node_max_active - Update per-node max_actives to use

1497 * @wq: workqueue to update

1498 * @off_cpu: CPU that's going down, -1 if a CPU is not going down

1499 *

1500 * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is

1501 * distributed among nodes according to the proportions of numbers of online

1502 * cpus. The result is always between @wq->min_active and max_active.

1503 */

1504 static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)

1505 {

1506 struct cpumask *effective = unbound_effective_cpumask(wq);

1507 int min_active = READ_ONCE(wq->min_active);

1508 int max_active = READ_ONCE(wq->max_active);

1509 int total_cpus, node;

1510

1511 lockdep_assert_held(&wq->mutex);

1512

1513 if (!cpumask_test_cpu(off_cpu, effective))

1514 off_cpu = -1;

1515

1516 total_cpus = cpumask_weight_and(effective, cpu_online_mask);

1517 if (off_cpu >= 0)

1518 total_cpus--;

1519

1520 for_each_node(node) {

1521 int node_cpus;

1522

1523 node_cpus = cpumask_weight_and(effective, cpumask_of_node(node));

1524 if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)

1525 node_cpus--;

1526

1527 wq_node_nr_active(wq, node)->max =

1528 clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus),

1529 min_active, max_active);

1530 }

1531

1532 wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active;

1533 }

1534

1535 /**

1536 * get_pwq - get an extra reference on the specified pool_workqueue

1537 * @pwq: pool_workqueue to get

1538 *

1539 * Obtain an extra reference on @pwq. The caller should guarantee that

1540 * @pwq has positive refcnt and be holding the matching pool->lock.

1541 */

1542 static void get_pwq(struct pool_workqueue *pwq)

1543 {

1544 lockdep_assert_held(&pwq->pool->lock);

1545 WARN_ON_ONCE(pwq->refcnt <= 0);

1546 pwq->refcnt++;

1547 }

1548

1549 /**

1550 * put_pwq - put a pool_workqueue reference

1551 * @pwq: pool_workqueue to put

1552 *

1553 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its

1554 * destruction. The caller should be holding the matching pool->lock.

1555 */

1556 static void put_pwq(struct pool_workqueue *pwq)

1557 {

1558 lockdep_assert_held(&pwq->pool->lock);

1559 if (likely(--pwq->refcnt))

1560 return;

1561 /*

1562 * @pwq can't be released under pool->lock, bounce to a dedicated

1563 * kthread_worker to avoid A-A deadlocks.

1564 */

1565 kthread_queue_work(pwq_release_worker, &pwq->release_work);

1566 }

1567

1568 /**

1569 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock

1570 * @pwq: pool_workqueue to put (can be %NULL)

1571 *

1572 * put_pwq() with locking. This function also allows %NULL @pwq.

1573 */

1574 static void put_pwq_unlocked(struct pool_workqueue *pwq)

1575 {

1576 if (pwq) {

1577 /*

1578 * As both pwqs and pools are RCU protected, the

1579 * following lock operations are safe.

1580 */

1581 raw_spin_lock_irq(&pwq->pool->lock);

1582 put_pwq(pwq);

1583 raw_spin_unlock_irq(&pwq->pool->lock);

1584 }

1585 }

1586

1587 static bool pwq_is_empty(struct pool_workqueue *pwq)

1588 {

1589 return !pwq->nr_active && list_empty(&pwq->inactive_works);

1590 }

1591

1592 static void __pwq_activate_work(struct pool_workqueue *pwq,

1593 struct work_struct *work)

1594 {

1595 unsigned long *wdb = work_data_bits(work);

1596

1597 WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));

1598 trace_workqueue_activate_work(work);

1599 if (list_empty(&pwq->pool->worklist))

1600 pwq->pool->watchdog_ts = jiffies;

1601 move_linked_works(work, &pwq->pool->worklist, NULL);

1602 __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);

1603 }

1604

1605 /**

1606 * pwq_activate_work - Activate a work item if inactive

1607 * @pwq: pool_workqueue @work belongs to

1608 * @work: work item to activate

1609 *

1610 * Returns %true if activated. %false if already active.

1611 */

1612 static bool pwq_activate_work(struct pool_workqueue *pwq,

1613 struct work_struct *work)

1614 {

1615 struct worker_pool *pool = pwq->pool;

1616 struct wq_node_nr_active *nna;

1617

1618 lockdep_assert_held(&pool->lock);

1619

1620 if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))

1621 return false;

1622

1623 nna = wq_node_nr_active(pwq->wq, pool->node);

1624 if (nna)

1625 atomic_inc(&nna->nr);

1626

1627 pwq->nr_active++;

1628 __pwq_activate_work(pwq, work);

1629 return true;

1630 }

1631

1632 static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)

1633 {

1634 int max = READ_ONCE(nna->max);

1635

1636 while (true) {

1637 int old, tmp;

1638

1639 old = atomic_read(&nna->nr);

1640 if (old >= max)

1641 return false;

1642 tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);

1643 if (tmp == old)

1644 return true;

1645 }

1646 }

1647

/**
 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Try to increment nr_active for @pwq. For per-cpu workqueues only
 * @pwq->nr_active is compared against the workqueue's max_active. For unbound
 * workqueues a count must additionally be obtained from the shared per-node
 * counter; if that fails, @pwq is put on the node's pending list. Must be
 * called with @pwq->pool->lock held.
 *
 * Returns %true if an nr_active count is successfully obtained. %false
 * otherwise.
 */
static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
	struct workqueue_struct *wq = pwq->wq;
	struct worker_pool *pool = pwq->pool;
	struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
	bool obtained = false;

	lockdep_assert_held(&pool->lock);

	if (!nna) {
		/* per-cpu workqueue, pwq->nr_active is sufficient */
		obtained = pwq->nr_active < READ_ONCE(wq->max_active);
		goto out;
	}

	/*
	 * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
	 * already waiting on $nna, pwq_dec_nr_active() will maintain the
	 * concurrency level. Don't jump the line.
	 *
	 * We need to ignore the pending test after max_active has increased as
	 * pwq_dec_nr_active() can only maintain the concurrency level but not
	 * increase it. This is indicated by @fill.
	 */
	if (!list_empty(&pwq->pending_node) && likely(!fill))
		goto out;

	/* fast path: lockless attempt on the shared counter */
	obtained = tryinc_node_nr_active(nna);
	if (obtained)
		goto out;

	/*
	 * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
	 * and try again. The smp_mb() is paired with the implied memory barrier
	 * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
	 * we see the decremented $nna->nr or they see non-empty
	 * $nna->pending_pwqs.
	 */
	raw_spin_lock(&nna->lock);

	if (list_empty(&pwq->pending_node))
		list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
	else if (likely(!fill))
		goto out_unlock;

	smp_mb();

	obtained = tryinc_node_nr_active(nna);

	/*
	 * If @fill, @pwq might have already been pending. Being spuriously
	 * pending in cold paths doesn't affect anything. Let's leave it be.
	 */
	if (obtained && likely(!fill))
		list_del_init(&pwq->pending_node);

out_unlock:
	raw_spin_unlock(&nna->lock);
out:
	/* every successful path accounts the count on @pwq itself as well */
	if (obtained)
		pwq->nr_active++;
	return obtained;
}

1719

1720 /**

1721 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq

1722 * @pwq: pool_workqueue of interest

1723 * @fill: max_active may have increased, try to increase concurrency level

1724 *

1725 * Activate the first inactive work item of @pwq if available and allowed by

1726 * max_active limit.

1727 *

1728 * Returns %true if an inactive work item has been activated. %false if no

1729 * inactive work item is found or max_active limit is reached.

1730 */

1731 static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)

1732 {

1733 struct work_struct *work =

1734 list_first_entry_or_null(&pwq->inactive_works,

1735 struct work_struct, entry);

1736

1737 if (work && pwq_tryinc_nr_active(pwq, fill)) {

1738 __pwq_activate_work(pwq, work);

1739 return true;

1740 } else {

1741 return false;

1742 }

1743 }

1744

/**
 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
 * @nna: wq_node_nr_active to activate a pending pwq for
 * @caller_pool: worker_pool the caller is locking
 *
 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
 * @caller_pool may be unlocked and relocked to lock other worker_pools; on
 * return @caller_pool is locked again and no other pool lock is held.
 *
 * At most one inactive work item is activated per call. Pending pwqs which
 * turn out to have no inactive work items are dropped from the pending list
 * along the way.
 */
static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
				      struct worker_pool *caller_pool)
{
	/* the pool whose lock is currently held; starts out as the caller's */
	struct worker_pool *locked_pool = caller_pool;
	struct pool_workqueue *pwq;
	struct work_struct *work;

	lockdep_assert_held(&caller_pool->lock);

	raw_spin_lock(&nna->lock);
retry:
	pwq = list_first_entry_or_null(&nna->pending_pwqs,
				       struct pool_workqueue, pending_node);
	if (!pwq)
		goto out_unlock;

	/*
	 * If @pwq is for a different pool than @locked_pool, we need to lock
	 * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
	 * / lock dance. For that, we also need to release @nna->lock as it's
	 * nested inside pool locks.
	 */
	if (pwq->pool != locked_pool) {
		raw_spin_unlock(&locked_pool->lock);
		locked_pool = pwq->pool;
		if (!raw_spin_trylock(&locked_pool->lock)) {
			raw_spin_unlock(&nna->lock);
			raw_spin_lock(&locked_pool->lock);
			raw_spin_lock(&nna->lock);
			/* @nna->lock was dropped, pending_pwqs may have changed */
			goto retry;
		}
	}

	/*
	 * $pwq may not have any inactive work items due to e.g. cancellations.
	 * Drop it from pending_pwqs and see if there's another one.
	 */
	work = list_first_entry_or_null(&pwq->inactive_works,
					struct work_struct, entry);
	if (!work) {
		list_del_init(&pwq->pending_node);
		goto retry;
	}

	/*
	 * Acquire an nr_active count and activate the inactive work item. If
	 * $pwq still has inactive work items, rotate it to the end of the
	 * pending_pwqs so that we round-robin through them. This means that
	 * inactive work items are not activated in queueing order which is fine
	 * given that there has never been any ordering across different pwqs.
	 */
	if (likely(tryinc_node_nr_active(nna))) {
		pwq->nr_active++;
		__pwq_activate_work(pwq, work);

		if (list_empty(&pwq->inactive_works))
			list_del_init(&pwq->pending_node);
		else
			list_move_tail(&pwq->pending_node, &nna->pending_pwqs);

		/* if activating a foreign pool, make sure it's running */
		if (pwq->pool != caller_pool)
			kick_pool(pwq->pool);
	}

out_unlock:
	raw_spin_unlock(&nna->lock);
	/* hand the lock back: release the foreign pool, retake the caller's */
	if (locked_pool != caller_pool) {
		raw_spin_unlock(&locked_pool->lock);
		raw_spin_lock(&caller_pool->lock);
	}
}

1825

1826 /**

1827 * pwq_dec_nr_active - Retire an active count

1828 * @pwq: pool_workqueue of interest

1829 *

1830 * Decrement @pwq's nr_active and try to activate the first inactive work item.

1831 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.

1832 */

1833 static void pwq_dec_nr_active(struct pool_workqueue *pwq)

1834 {

1835 struct worker_pool *pool = pwq->pool;

1836 struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);

1837

1838 lockdep_assert_held(&pool->lock);

1839

1840 /*

1841 * @pwq->nr_active should be decremented for both percpu and unbound

1842 * workqueues.

1843 */

1844 pwq->nr_active--;

1845

1846 /*

1847 * For a percpu workqueue, it's simple. Just need to kick the first

1848 * inactive work item on @pwq itself.

1849 */

1850 if (!nna) {

1851 pwq_activate_first_inactive(pwq, false);

1852 return;

1853 }

1854

1855 /*

1856 * If @pwq is for an unbound workqueue, it's more complicated because

1857 * multiple pwqs and pools may be sharing the nr_active count. When a

1858 * pwq needs to wait for an nr_active count, it puts itself on

1859 * $nna->pending_pwqs. The following atomic_dec_return()'s implied

1860 * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to

1861 * guarantee that either we see non-empty pending_pwqs or they see

1862 * decremented $nna->nr.

1863 *

1864 * $nna->max may change as CPUs come online/offline and @pwq->wq's

1865 * max_active gets updated. However, it is guaranteed to be equal to or

1866 * larger than @pwq->wq->min_active which is above zero unless freezing.

1867 * This maintains the forward progress guarantee.

1868 */

1869 if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max))

1870 return;

1871

1872 if (!list_empty(&nna->pending_pwqs))

1873 node_activate_pending_pwq(nna, pool);

1874 }

1875

1876 /**

1877 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight

1878 * @pwq: pwq of interest

1879 * @work_data: work_data of work which left the queue

1880 *

1881 * A work either has completed or is removed from pending queue,

1882 * decrement nr_in_flight of its pwq and handle workqueue flushing.

1883 *

1884 * NOTE:

1885 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock

1886 * and thus should be called after all other state updates for the in-flight

1887 * work item is complete.

1888 *

1889 * CONTEXT:

1890 * raw_spin_lock_irq(pool->lock).

1891 */

1892 static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)

1893 {

1894 int color = get_work_color(work_data);

1895

1896 if (!(work_data & WORK_STRUCT_INACTIVE))

1897 pwq_dec_nr_active(pwq);

1898

1899 pwq->nr_in_flight[color]--;

1900

1901 /* is flush in progress and are we at the flushing tip? */

1902 if (likely(pwq->flush_color != color))

1903 goto out_put;

1904

1905 /* are there still in-flight works? */

1906 if (pwq->nr_in_flight[color])

1907 goto out_put;

1908

1909 /* this pwq is done, clear flush_color */

1910 pwq->flush_color = -1;

1911

1912 /*

1913 * If this was the last pwq, wake up the first flusher. It

1914 * will handle the rest.

1915 */

1916 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))

1917 complete(&pwq->wq->first_flusher->done);

1918 out_put:

1919 put_pwq(pwq);

1920 }

1921

1922 /**

1923 * try_to_grab_pending - steal work item from worklist and disable irq

1924 * @work: work item to steal

1925 * @is_dwork: @work is a delayed_work

1926 * @flags: place to store irq state

1927 *

1928 * Try to grab PENDING bit of @work. This function can handle @work in any

1929 * stable state - idle, on timer or on worklist.

1930 *

1931 * Return:

1932 *

1933 * ======== ================================================================

1934 * 1 if @work was pending and we successfully stole PENDING

1935 * 0 if @work was idle and we claimed PENDING

1936 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry

1937 * -ENOENT if someone else is canceling @work, this state may persist

1938 * for arbitrarily long

1939 * ======== ================================================================

1940 *

1941 * Note:

1942 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting

1943 * interrupted while holding PENDING and @work off queue, irq must be

1944 * disabled on entry. This, combined with delayed_work->timer being

1945 * irqsafe, ensures that we return -EAGAIN for finite short period of time.

1946 *

1947 * On successful return, >= 0, irq is disabled and the caller is

1948 * responsible for releasing it using local_irq_restore(*@flags).

1949 *

1950 * This function is safe to call from any context including IRQ handler.

1951 */

1952 static int try_to_grab_pending(struct work_struct *work, bool is_dwork,

1953 unsigned long *flags)

1954 {

1955 struct worker_pool *pool;

1956 struct pool_workqueue *pwq;

1957

1958 local_irq_save(*flags);

1959

1960 /* try to steal the timer if it exists */

1961 if (is_dwork) {

1962 struct delayed_work *dwork = to_delayed_work(work);

1963

1964 /*

1965 * dwork->timer is irqsafe. If del_timer() fails, it's

1966 * guaranteed that the timer is not queued anywhere and not

1967 * running on the local CPU.

1968 */

1969 if (likely(del_timer(&dwork->timer)))

1970 return 1;

1971 }

1972

1973 /* try to claim PENDING the normal way */

1974 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))

1975 return 0;

1976

1977 rcu_read_lock();

1978 /*

1979 * The queueing is in progress, or it is already queued. Try to

1980 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.

1981 */

1982 pool = get_work_pool(work);

1983 if (!pool)

1984 goto fail;

1985

1986 raw_spin_lock(&pool->lock);

1987 /*

1988 * work->data is guaranteed to point to pwq only while the work

1989 * item is queued on pwq->wq, and both updating work->data to point

1990 * to pwq on queueing and to pool on dequeueing are done under

1991 * pwq->pool->lock. This in turn guarantees that, if work->data

1992 * points to pwq which is associated with a locked pool, the work

1993 * item is currently queued on that pool.

1994 */

1995 pwq = get_work_pwq(work);

1996 if (pwq && pwq->pool == pool) {

1997 debug_work_deactivate(work);

1998

1999 /*

2000 * A cancelable inactive work item must be in the

2001 * pwq->inactive_works since a queued barrier can't be

2002 * canceled (see the comments in insert_wq_barrier()).

2003 *

2004 * An inactive work item cannot be grabbed directly because

2005 * it might have linked barrier work items which, if left

2006 * on the inactive_works list, will confuse pwq->nr_active

2007 * management later on and cause stall. Make sure the work

2008 * item is activated before grabbing.

2009 */

2010 pwq_activate_work(pwq, work);

2011

2012 list_del_init(&work->entry);

2013

2014 /* work->data points to pwq iff queued, point to pool */

2015 set_work_pool_and_keep_pending(work, pool->id);

2016

2017 /* must be the last step, see the function comment */

2018 pwq_dec_nr_in_flight(pwq, *work_data_bits(work));

2019

2020 raw_spin_unlock(&pool->lock);

2021 rcu_read_unlock();

2022 return 1;

2023 }

2024 raw_spin_unlock(&pool->lock);

2025 fail:

2026 rcu_read_unlock();

2027 local_irq_restore(*flags);

2028 if (work_is_canceling(work))

2029 return -ENOENT;

2030 cpu_relax();

2031 return -EAGAIN;

2032 }

2033

2034 /**

2035 * insert_work - insert a work into a pool

2036 * @pwq: pwq @work belongs to

2037 * @work: work to insert

2038 * @head: insertion point

2039 * @extra_flags: extra WORK_STRUCT_* flags to set

2040 *

2041 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to

2042 * work_struct flags.

2043 *

2044 * CONTEXT:

2045 * raw_spin_lock_irq(pool->lock).

2046 */

2047 static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,

2048 struct list_head *head, unsigned int extra_flags)

2049 {

2050 debug_work_activate(work);

2051

2052 /* record the work call stack in order to print it in KASAN reports */

2053 kasan_record_aux_stack_noalloc(work);

2054

2055 /* we own @work, set data and link */

2056 set_work_pwq(work, pwq, extra_flags);

2057 list_add_tail(&work->entry, head);

2058 get_pwq(pwq);

2059 }

2060

2061 /*

2062 * Test whether @work is being queued from another work executing on the

2063 * same workqueue.

2064 */

2065 static bool is_chained_work(struct workqueue_struct *wq)

2066 {

2067 struct worker *worker;

2068

2069 worker = current_wq_worker();

2070 /*

2071 * Return %true iff I'm a worker executing a work item on @wq. If

2072 * I'm @worker, it's safe to dereference it without locking.

2073 */

2074 return worker && worker->current_pwq->wq == wq;

2075 }

2076

2077 /*

2078 * When queueing an unbound work item to a wq, prefer local CPU if allowed

2079 * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to

2080 * avoid perturbing sensitive tasks.

2081 */

2082 static int wq_select_unbound_cpu(int cpu)

2083 {

2084 int new_cpu;

2085

2086 if (likely(!wq_debug_force_rr_cpu)) {

2087 if (cpumask_test_cpu(cpu, wq_unbound_cpumask))

2088 return cpu;

2089 } else {

2090 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");

2091 }

2092

2093 new_cpu = __this_cpu_read(wq_rr_cpu_last);

2094 new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);

2095 if (unlikely(new_cpu >= nr_cpu_ids)) {

2096 new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);

2097 if (unlikely(new_cpu >= nr_cpu_ids))

2098 return cpu;

2099 }

2100 __this_cpu_write(wq_rr_cpu_last, new_cpu);

2101

2102 return new_cpu;

2103 }

2104

2105 static void __queue_work(int cpu, struct workqueue_struct *wq,

2106 struct work_struct *work)

2107 {

2108 struct pool_workqueue *pwq;

2109 struct worker_pool *last_pool, *pool;

2110 unsigned int work_flags;

2111 unsigned int req_cpu = cpu;

2112

2113 /*

2114 * While a work item is PENDING && off queue, a task trying to

2115 * steal the PENDING will busy-loop waiting for it to either get

2116 * queued or lose PENDING. Grabbing PENDING and queueing should

2117 * happen with IRQ disabled.

2118 */

2119 lockdep_assert_irqs_disabled();

2120

2121

2122 /*

2123 * For a draining wq, only works from the same workqueue are

2124 * allowed. The __WQ_DESTROYING helps to spot the issue that

2125 * queues a new work item to a wq after destroy_workqueue(wq).

2126 */

2127 if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&

2128 WARN_ON_ONCE(!is_chained_work(wq))))

2129 return;

2130 rcu_read_lock();

2131 retry:

2132 /* pwq which will be used unless @work is executing elsewhere */

2133 if (req_cpu == WORK_CPU_UNBOUND) {

2134 if (wq->flags & WQ_UNBOUND)

2135 cpu = wq_select_unbound_cpu(raw_smp_processor_id());

2136 else

2137 cpu = raw_smp_processor_id();

2138 }

2139

2140 pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));

2141 pool = pwq->pool;

2142

2143 /*

2144 * If @work was previously on a different pool, it might still be

2145 * running there, in which case the work needs to be queued on that

2146 * pool to guarantee non-reentrancy.

2147 */

2148 last_pool = get_work_pool(work);

2149 if (last_pool && last_pool != pool) {

2150 struct worker *worker;

2151

2152 raw_spin_lock(&last_pool->lock);

2153

2154 worker = find_worker_executing_work(last_pool, work);

2155

2156 if (worker && worker->current_pwq->wq == wq) {

2157 pwq = worker->current_pwq;

2158 pool = pwq->pool;

2159 WARN_ON_ONCE(pool != last_pool);

2160 } else {

2161 /* meh... not running there, queue here */

2162 raw_spin_unlock(&last_pool->lock);

2163 raw_spin_lock(&pool->lock);

2164 }

2165 } else {

2166 raw_spin_lock(&pool->lock);

2167 }

2168

2169 /*

2170 * pwq is determined and locked. For unbound pools, we could have raced

2171 * with pwq release and it could already be dead. If its refcnt is zero,

2172 * repeat pwq selection. Note that unbound pwqs never die without

2173 * another pwq replacing it in cpu_pwq or while work items are executing

2174 * on it, so the retrying is guaranteed to make forward-progress.

2175 */

2176 if (unlikely(!pwq->refcnt)) {

2177 if (wq->flags & WQ_UNBOUND) {

2178 raw_spin_unlock(&pool->lock);

2179 cpu_relax();

2180 goto retry;

2181 }

2182 /* oops */

2183 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",

2184 wq->name, cpu);

2185 }

2186

2187 /* pwq determined, queue */

2188 trace_workqueue_queue_work(req_cpu, pwq, work);

2189

2190 if (WARN_ON(!list_empty(&work->entry)))

2191 goto out;

2192

2193 pwq->nr_in_flight[pwq->work_color]++;

2194 work_flags = work_color_to_flags(pwq->work_color);

2195

2196 /*

2197 * Limit the number of concurrently active work items to max_active.

2198 * @work must also queue behind existing inactive work items to maintain

2199 * ordering when max_active changes. See wq_adjust_max_active().

2200 */

2201 if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {

2202 if (list_empty(&pool->worklist))

2203 pool->watchdog_ts = jiffies;

2204

2205 trace_workqueue_activate_work(work);

2206 insert_work(pwq, work, &pool->worklist, work_flags);

2207 kick_pool(pool);

2208 } else {

2209 work_flags |= WORK_STRUCT_INACTIVE;

2210 insert_work(pwq, work, &pwq->inactive_works, work_flags);

2211 }

2212

2213 out:

2214 raw_spin_unlock(&pool->lock);

2215 rcu_read_unlock();

2216 }

2217

2218 /**

2219 * queue_work_on - queue work on specific cpu

2220 * @cpu: CPU number to execute work on

2221 * @wq: workqueue to use

2222 * @work: work to queue

2223 *

2224 * We queue the work to a specific CPU, the caller must ensure it

2225 * can't go away. Callers that fail to ensure that the specified

2226 * CPU cannot go away will execute on a randomly chosen CPU.

2227 * But note well that callers specifying a CPU that never has been

2228 * online will get a splat.

2229 *

2230 * Return: %false if @work was already on a queue, %true otherwise.

2231 */

2232 bool queue_work_on(int cpu, struct workqueue_struct *wq,

2233 struct work_struct *work)

2234 {

2235 bool ret = false;

2236 unsigned long flags;

2237

2238 local_irq_save(flags);

2239

2240 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {

2241 __queue_work(cpu, wq, work);

2242 ret = true;

2243 }

2244

2245 local_irq_restore(flags);

2246 return ret;

2247 }

2248 EXPORT_SYMBOL(queue_work_on);

2249

2250 /**

2251 * select_numa_node_cpu - Select a CPU based on NUMA node

2252 * @node: NUMA node ID that we want to select a CPU from

2253 *

2254 * This function will attempt to find a "random" cpu available on a given

2255 * node. If there are no CPUs available on the given node it will return

2256 * WORK_CPU_UNBOUND indicating that we should just schedule to any

2257 * available CPU if we need to schedule this work.

2258 */

2259 static int select_numa_node_cpu(int node)

2260 {

2261 int cpu;

2262

2263 /* Delay binding to CPU if node is not valid or online */

2264 if (node < 0 || node >= MAX_NUMNODES || !node_online(node))

2265 return WORK_CPU_UNBOUND;

2266

2267 /* Use local node/cpu if we are already there */

2268 cpu = raw_smp_processor_id();

2269 if (node == cpu_to_node(cpu))

2270 return cpu;

2271

2272 /* Use "random" otherwise know as "first" online CPU of node */

2273 cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);

2274

2275 /* If CPU is valid return that, otherwise just defer */

2276 return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;

2277 }

2278

2279 /**

2280 * queue_work_node - queue work on a "random" cpu for a given NUMA node

2281 * @node: NUMA node that we are targeting the work for

2282 * @wq: workqueue to use

2283 * @work: work to queue

2284 *

2285 * We queue the work to a "random" CPU within a given NUMA node. The basic

2286 * idea here is to provide a way to somehow associate work with a given

2287 * NUMA node.

2288 *

2289 * This function will only make a best effort attempt at getting this onto

2290 * the right NUMA node. If no node is requested or the requested node is

2291 * offline then we just fall back to standard queue_work behavior.

2292 *

2293 * Currently the "random" CPU ends up being the first available CPU in the

2294 * intersection of cpu_online_mask and the cpumask of the node, unless we

2295 * are running on the node. In that case we just use the current CPU.

2296 *

2297 * Return: %false if @work was already on a queue, %true otherwise.

2298 */

2299 bool queue_work_node(int node, struct workqueue_struct *wq,

2300 struct work_struct *work)

2301 {

2302 unsigned long flags;

2303 bool ret = false;

2304

2305 /*

2306 * This current implementation is specific to unbound workqueues.

2307 * Specifically we only return the first available CPU for a given

2308 * node instead of cycling through individual CPUs within the node.

2309 *

2310 * If this is used with a per-cpu workqueue then the logic in

2311 * workqueue_select_cpu_near would need to be updated to allow for

2312 * some round robin type logic.

2313 */

2314 WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

2315

2316 local_irq_save(flags);

2317

2318 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {

2319 int cpu = select_numa_node_cpu(node);

2320

2321 __queue_work(cpu, wq, work);

2322 ret = true;

2323 }

2324

2325 local_irq_restore(flags);

2326 return ret;

2327 }

2328 EXPORT_SYMBOL_GPL(queue_work_node);

2329

2330 void delayed_work_timer_fn(struct timer_list *t)

2331 {

2332 struct delayed_work *dwork = from_timer(dwork, t, timer);

2333

2334 /* should have been called from irqsafe timer with irq already off */

2335 __queue_work(dwork->cpu, dwork->wq, &dwork->work);

2336 }

2337 EXPORT_SYMBOL(delayed_work_timer_fn);

2338

2339 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,

2340 struct delayed_work *dwork, unsigned long delay)

2341 {

2342 struct timer_list *timer = &dwork->timer;

2343 struct work_struct *work = &dwork->work;

2344

2345 WARN_ON_ONCE(!wq);

2346 WARN_ON_ONCE(timer->function != delayed_work_timer_fn);

2347 WARN_ON_ONCE(timer_pending(timer));

2348 WARN_ON_ONCE(!list_empty(&work->entry));

2349

2350 /*

2351 * If @delay is 0, queue @dwork->work immediately. This is for

2352 * both optimization and correctness. The earliest @timer can

2353 * expire is on the closest next tick and delayed_work users depend

2354 * on that there's no such delay when @delay is 0.

2355 */

2356 if (!delay) {

2357 __queue_work(cpu, wq, &dwork->work);

2358 return;

2359 }

2360

2361 dwork->wq = wq;

2362 dwork->cpu = cpu;

2363 timer->expires = jiffies + delay;

2364

2365 if (housekeeping_enabled(HK_TYPE_TIMER)) {

2366 /* If the current cpu is a housekeeping cpu, use it. */

2367 cpu = smp_processor_id();

2368 if (!housekeeping_test_cpu(cpu, HK_TYPE_TIMER))

2369 cpu = housekeeping_any_cpu(HK_TYPE_TIMER);

2370 add_timer_on(timer, cpu);

2371 } else {

2372 if (likely(cpu == WORK_CPU_UNBOUND))

2373 add_timer(timer);

2374 else

2375 add_timer_on(timer, cpu);

2376 }

2377 }

2378

2379 /**

2380 * queue_delayed_work_on - queue work on specific CPU after delay

2381 * @cpu: CPU number to execute work on

2382 * @wq: workqueue to use

2383 * @dwork: work to queue

2384 * @delay: number of jiffies to wait before queueing

2385 *

2386 * Return: %false if @work was already on a queue, %true otherwise. If

2387 * @delay is zero and @dwork is idle, it will be scheduled for immediate

2388 * execution.

2389 */

2390 bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,

2391 struct delayed_work *dwork, unsigned long delay)

2392 {

2393 struct work_struct *work = &dwork->work;

2394 bool ret = false;

2395 unsigned long flags;

2396

2397 /* read the comment in __queue_work() */

2398 local_irq_save(flags);

2399

2400 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {

2401 __queue_delayed_work(cpu, wq, dwork, delay);

2402 ret = true;

2403 }

2404

2405 local_irq_restore(flags);

2406 return ret;

2407 }

2408 EXPORT_SYMBOL(queue_delayed_work_on);

2409

2410 /**

2411 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU

2412 * @cpu: CPU number to execute work on

2413 * @wq: workqueue to use

2414 * @dwork: work to queue

2415 * @delay: number of jiffies to wait before queueing

2416 *

2417 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,

2418 * modify @dwork's timer so that it expires after @delay. If @delay is

2419 * zero, @work is guaranteed to be scheduled immediately regardless of its

2420 * current state.

2421 *

2422 * Return: %false if @dwork was idle and queued, %true if @dwork was

2423 * pending and its timer was modified.

2424 *

2425 * This function is safe to call from any context including IRQ handler.

2426 * See try_to_grab_pending() for details.

2427 */

2428 bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,

2429 struct delayed_work *dwork, unsigned long delay)

2430 {

2431 unsigned long flags;

2432 int ret;

2433

2434 do {

2435 ret = try_to_grab_pending(&dwork->work, true, &flags);

2436 } while (unlikely(ret == -EAGAIN));

2437

2438 if (likely(ret >= 0)) {

2439 __queue_delayed_work(cpu, wq, dwork, delay);

2440 local_irq_restore(flags);

2441 }

2442

2443 /* -ENOENT from try_to_grab_pending() becomes %true */

2444 return ret;

2445 }

2446 EXPORT_SYMBOL_GPL(mod_delayed_work_on);

2447

2448 static void rcu_work_rcufn(struct rcu_head *rcu)

2449 {

2450 struct rcu_work *rwork = container_of(rcu, struct rcu_work, rcu);

2451

2452 /* read the comment in __queue_work() */

2453 local_irq_disable();

2454 __queue_work(WORK_CPU_UNBOUND, rwork->wq, &rwork->work);

2455 local_irq_enable();

2456 }

2457

2458 /**

2459 * queue_rcu_work - queue work after a RCU grace period

2460 * @wq: workqueue to use

2461 * @rwork: work to queue

2462 *

2463 * Return: %false if @rwork was already pending, %true otherwise. Note

2464 * that a full RCU grace period is guaranteed only after a %true return.

2465 * While @rwork is guaranteed to be executed after a %false return, the

2466 * execution may happen before a full RCU grace period has passed.

2467 */

2468 bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)

2469 {

2470 struct work_struct *work = &rwork->work;

2471

2472 if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {

2473 rwork->wq = wq;

2474 call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);

2475 return true;

2476 }

2477

2478 return false;

2479 }

2480 EXPORT_SYMBOL(queue_rcu_work);

2481

2482 static struct worker *alloc_worker(int node)

2483 {

2484 struct worker *worker;

2485

2486 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);

2487 if (worker) {

2488 INIT_LIST_HEAD(&worker->entry);

2489 INIT_LIST_HEAD(&worker->scheduled);

2490 INIT_LIST_HEAD(&worker->node);

2491 /* on creation a worker is in !idle && prep state */

2492 worker->flags = WORKER_PREP;

2493 }

2494 return worker;

2495 }

2496

2497 static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)

2498 {

2499 if (pool->cpu < 0 && pool->attrs->affn_strict)

2500 return pool->attrs->__pod_cpumask;

2501 else

2502 return pool->attrs->cpumask;

2503 }

2504

2505 /**

2506 * worker_attach_to_pool() - attach a worker to a pool

2507 * @worker: worker to be attached

2508 * @pool: the target pool

2509 *

2510 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and

2511 * cpu-binding of @worker are kept coordinated with the pool across

2512 * cpu-[un]hotplugs.

2513 */

2514 static void worker_attach_to_pool(struct worker *worker,

2515 struct worker_pool *pool)

2516 {

2517 mutex_lock(&wq_pool_attach_mutex);

2518

2519 /*

2520 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains

2521 * stable across this function. See the comments above the flag

2522 * definition for details.

2523 */

2524 if (pool->flags & POOL_DISASSOCIATED)

2525 worker->flags |= WORKER_UNBOUND;

2526 else

2527 kthread_set_per_cpu(worker->task, pool->cpu);

2528

2529 if (worker->rescue_wq)

2530 set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));

2531

2532 list_add_tail(&worker->node, &pool->workers);

2533 worker->pool = pool;

2534

2535 mutex_unlock(&wq_pool_attach_mutex);

2536 }

2537

2538 /**

2539 * worker_detach_from_pool() - detach a worker from its pool

2540 * @worker: worker which is attached to its pool

2541 *

2542 * Undo the attaching which had been done in worker_attach_to_pool(). The

2543 * caller worker shouldn't access to the pool after detached except it has

2544 * other reference to the pool.

2545 */

2546 static void worker_detach_from_pool(struct worker *worker)

2547 {

2548 struct worker_pool *pool = worker->pool;

2549 struct completion *detach_completion = NULL;

2550

2551 mutex_lock(&wq_pool_attach_mutex);

2552

2553 kthread_set_per_cpu(worker->task, -1);

2554 list_del(&worker->node);

2555 worker->pool = NULL;

2556

2557 if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))

2558 detach_completion = pool->detach_completion;

2559 mutex_unlock(&wq_pool_attach_mutex);

2560

2561 /* clear leftover flags without pool->lock after it is detached */

2562 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);

2563

2564 if (detach_completion)

2565 complete(detach_completion);

2566 }

2567

2568 /**

2569 * create_worker - create a new workqueue worker

2570 * @pool: pool the new worker will belong to

2571 *

2572 * Create and start a new worker which is attached to @pool.

2573 *

2574 * CONTEXT:

2575 * Might sleep. Does GFP_KERNEL allocations.

2576 *

2577 * Return:

2578 * Pointer to the newly created worker.

2579 */

2580 static struct worker *create_worker(struct worker_pool *pool)

2581 {

2582 struct worker *worker;

2583 int id;

2584 char id_buf[23];

2585

2586 /* ID is needed to determine kthread name */

2587 id = ida_alloc(&pool->worker_ida, GFP_KERNEL);

2588 if (id < 0) {

2589 pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",

2590 ERR_PTR(id));

2591 return NULL;

2592 }

2593

2594 worker = alloc_worker(pool->node);

2595 if (!worker) {

2596 pr_err_once("workqueue: Failed to allocate a worker\n");

2597 goto fail;

2598 }

2599

2600 worker->id = id;

2601

2602 if (pool->cpu >= 0)

2603 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,

2604 pool->attrs->nice < 0 ? "H" : "");

2605 else

2606 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

2607

2608 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,

2609 "kworker/%s", id_buf);

2610 if (IS_ERR(worker->task)) {

2611 if (PTR_ERR(worker->task) == -EINTR) {

2612 pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",

2613 id_buf);

2614 } else {

2615 pr_err_once("workqueue: Failed to create a worker thread: %pe",

2616 worker->task);

2617 }

2618 goto fail;

2619 }

2620

2621 set_user_nice(worker->task, pool->attrs->nice);

2622 kthread_bind_mask(worker->task, pool_allowed_cpus(pool));

2623

2624 /* successful, attach the worker to the pool */

2625 worker_attach_to_pool(worker, pool);

2626

2627 /* start the newly created worker */

2628 raw_spin_lock_irq(&pool->lock);

2629

2630 worker->pool->nr_workers++;

2631 worker_enter_idle(worker);

2632

2633 /*

2634 * @worker is waiting on a completion in kthread() and will trigger hung

2635 * check if not woken up soon. As kick_pool() is noop if @pool is empty,

2636 * wake it up explicitly.

2637 */

2638 wake_up_process(worker->task);

2639

2640 raw_spin_unlock_irq(&pool->lock);

2641

2642 return worker;

2643

2644 fail:

2645 ida_free(&pool->worker_ida, id);

2646 kfree(worker);

2647 return NULL;

2648 }

2649

2650 static void unbind_worker(struct worker *worker)

2651 {

2652 lockdep_assert_held(&wq_pool_attach_mutex);

2653

2654 kthread_set_per_cpu(worker->task, -1);

2655 if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))

2656 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);

2657 else

2658 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);

2659 }

2660

2661 static void wake_dying_workers(struct list_head *cull_list)

2662 {

2663 struct worker *worker, *tmp;

2664

2665 list_for_each_entry_safe(worker, tmp, cull_list, entry) {

2666 list_del_init(&worker->entry);

2667 unbind_worker(worker);

2668 /*

2669 * If the worker was somehow already running, then it had to be

2670 * in pool->idle_list when set_worker_dying() happened or we

2671 * wouldn't have gotten here.

2672 *

2673 * Thus, the worker must either have observed the WORKER_DIE

2674 * flag, or have set its state to TASK_IDLE. Either way, the

2675 * below will be observed by the worker and is safe to do

2676 * outside of pool->lock.

2677 */

2678 wake_up_process(worker->task);

2679 }

2680 }

2681

2682 /**

2683 * set_worker_dying - Tag a worker for destruction

2684 * @worker: worker to be destroyed

2685 * @list: transfer worker away from its pool->idle_list and into list

2686 *

2687 * Tag @worker for destruction and adjust @pool stats accordingly. The worker

2688 * should be idle.

2689 *

2690 * CONTEXT:

2691 * raw_spin_lock_irq(pool->lock).

2692 */

2693 static void set_worker_dying(struct worker *worker, struct list_head *list)

2694 {

2695 struct worker_pool *pool = worker->pool;

2696

2697 lockdep_assert_held(&pool->lock);

2698 lockdep_assert_held(&wq_pool_attach_mutex);

2699

2700 /* sanity check frenzy */

2701 if (WARN_ON(worker->current_work) ||

2702 WARN_ON(!list_empty(&worker->scheduled)) ||

2703 WARN_ON(!(worker->flags & WORKER_IDLE)))

2704 return;

2705

2706 pool->nr_workers--;

2707 pool->nr_idle--;

2708

2709 worker->flags |= WORKER_DIE;

2710

2711 list_move(&worker->entry, list);

2712 list_move(&worker->node, &pool->dying_workers);

2713 }

2714

2715 /**

2716 * idle_worker_timeout - check if some idle workers can now be deleted.

2717 * @t: The pool's idle_timer that just expired

2718 *

2719 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in

2720 * worker_leave_idle(), as a worker flicking between idle and active while its

2721 * pool is at the too_many_workers() tipping point would cause too much timer

2722 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let

2723 * it expire and re-evaluate things from there.

2724 */

2725 static void idle_worker_timeout(struct timer_list *t)

2726 {

2727 struct worker_pool *pool = from_timer(pool, t, idle_timer);

2728 bool do_cull = false;

2729

2730 if (work_pending(&pool->idle_cull_work))

2731 return;

2732

2733 raw_spin_lock_irq(&pool->lock);

2734

2735 if (too_many_workers(pool)) {

2736 struct worker *worker;

2737 unsigned long expires;

2738

2739 /* idle_list is kept in LIFO order, check the last one */

2740 worker = list_entry(pool->idle_list.prev, struct worker, entry);

2741 expires = worker->last_active + IDLE_WORKER_TIMEOUT;

2742 do_cull = !time_before(jiffies, expires);

2743

2744 if (!do_cull)

2745 mod_timer(&pool->idle_timer, expires);

2746 }

2747 raw_spin_unlock_irq(&pool->lock);

2748

2749 if (do_cull)

2750 queue_work(system_unbound_wq, &pool->idle_cull_work);

2751 }

2752

2753 /**

2754 * idle_cull_fn - cull workers that have been idle for too long.

2755 * @work: the pool's work for handling these idle workers

2756 *

2757 * This goes through a pool's idle workers and gets rid of those that have been

2758 * idle for at least IDLE_WORKER_TIMEOUT seconds.

2759 *

2760 * We don't want to disturb isolated CPUs because of a pcpu kworker being

2761 * culled, so this also resets worker affinity. This requires a sleepable

2762 * context, hence the split between timer callback and work item.

2763 */

2764 static void idle_cull_fn(struct work_struct *work)

2765 {

2766 struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);

2767 LIST_HEAD(cull_list);

2768

2769 /*

2770 * Grabbing wq_pool_attach_mutex here ensures an already-running worker

2771 * cannot proceed beyong worker_detach_from_pool() in its self-destruct

2772 * path. This is required as a previously-preempted worker could run after

2773 * set_worker_dying() has happened but before wake_dying_workers() did.

2774 */

2775 mutex_lock(&wq_pool_attach_mutex);

2776 raw_spin_lock_irq(&pool->lock);

2777

2778 while (too_many_workers(pool)) {

2779 struct worker *worker;

2780 unsigned long expires;

2781

2782 worker = list_entry(pool->idle_list.prev, struct worker, entry);

2783 expires = worker->last_active + IDLE_WORKER_TIMEOUT;

2784

2785 if (time_before(jiffies, expires)) {

2786 mod_timer(&pool->idle_timer, expires);

2787 break;

2788 }

2789

2790 set_worker_dying(worker, &cull_list);

2791 }

2792

2793 raw_spin_unlock_irq(&pool->lock);

2794 wake_dying_workers(&cull_list);

2795 mutex_unlock(&wq_pool_attach_mutex);

2796 }

2797

2798 static void send_mayday(struct work_struct *work)

2799 {

2800 struct pool_workqueue *pwq = get_work_pwq(work);

2801 struct workqueue_struct *wq = pwq->wq;

2802

2803 lockdep_assert_held(&wq_mayday_lock);

2804

2805 if (!wq->rescuer)

2806 return;

2807

2808 /* mayday mayday mayday */

2809 if (list_empty(&pwq->mayday_node)) {

2810 /*

2811 * If @pwq is for an unbound wq, its base ref may be put at

2812 * any time due to an attribute change. Pin @pwq until the

2813 * rescuer is done with it.

2814 */

2815 get_pwq(pwq);

2816 list_add_tail(&pwq->mayday_node, &wq->maydays);

2817 wake_up_process(wq->rescuer->task);

2818 pwq->stats[PWQ_STAT_MAYDAY]++;

2819 }

2820 }

2821

2822 static void pool_mayday_timeout(struct timer_list *t)

2823 {

2824 struct worker_pool *pool = from_timer(pool, t, mayday_timer);

2825 struct work_struct *work;

2826

2827 raw_spin_lock_irq(&pool->lock);

2828 raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */

2829

2830 if (need_to_create_worker(pool)) {

2831 /*

2832 * We've been trying to create a new worker but

2833 * haven't been successful. We might be hitting an

2834 * allocation deadlock. Send distress signals to

2835 * rescuers.

2836 */

2837 list_for_each_entry(work, &pool->worklist, entry)

2838 send_mayday(work);

2839 }

2840

2841 raw_spin_unlock(&wq_mayday_lock);

2842 raw_spin_unlock_irq(&pool->lock);

2843

2844 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);

2845 }

2846

2847 /**

2848 * maybe_create_worker - create a new worker if necessary

2849 * @pool: pool to create a new worker for

2850 *

2851 * Create a new worker for @pool if necessary. @pool is guaranteed to

2852 * have at least one idle worker on return from this function. If

2853 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is

2854 * sent to all rescuers with works scheduled on @pool to resolve

2855 * possible allocation deadlock.

2856 *

2857 * On return, need_to_create_worker() is guaranteed to be %false and

2858 * may_start_working() %true.

2859 *

2860 * LOCKING:

2861 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed

2862 * multiple times. Does GFP_KERNEL allocations. Called only from

2863 * manager.

2864 */

2865 static void maybe_create_worker(struct worker_pool *pool)

2866 __releases(&pool->lock)

2867 __acquires(&pool->lock)

2868 {

2869 restart:

2870 raw_spin_unlock_irq(&pool->lock);

2871

2872 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */

2873 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

2874

2875 while (true) {

2876 if (create_worker(pool) || !need_to_create_worker(pool))

2877 break;

2878

2879 schedule_timeout_interruptible(CREATE_COOLDOWN);

2880

2881 if (!need_to_create_worker(pool))

2882 break;

2883 }

2884

2885 del_timer_sync(&pool->mayday_timer);

2886 raw_spin_lock_irq(&pool->lock);

2887 /*

2888 * This is necessary even after a new worker was just successfully

2889 * created as @pool->lock was dropped and the new worker might have

2890 * already become busy.

2891 */

2892 if (need_to_create_worker(pool))

2893 goto restart;

2894 }

2895

2896 /**

2897 * manage_workers - manage worker pool

2898 * @worker: self

2899 *

2900 * Assume the manager role and manage the worker pool @worker belongs

2901 * to. At any given time, there can be only zero or one manager per

2902 * pool. The exclusion is handled automatically by this function.

2903 *

2904 * The caller can safely start processing works on false return. On

2905 * true return, it's guaranteed that need_to_create_worker() is false

2906 * and may_start_working() is true.

2907 *

2908 * CONTEXT:

2909 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed

2910 * multiple times. Does GFP_KERNEL allocations.

2911 *

2912 * Return:

2913 * %false if the pool doesn't need management and the caller can safely

2914 * start processing works, %true if management function was performed and

2915 * the conditions that the caller verified before calling the function may

2916 * no longer be true.

2917 */

2918 static bool manage_workers(struct worker *worker)

2919 {

2920 struct worker_pool *pool = worker->pool;

2921

2922 if (pool->flags & POOL_MANAGER_ACTIVE)

2923 return false;

2924

2925 pool->flags |= POOL_MANAGER_ACTIVE;

2926 pool->manager = worker;

2927

2928 maybe_create_worker(pool);

2929

2930 pool->manager = NULL;

2931 pool->flags &= ~POOL_MANAGER_ACTIVE;

2932 rcuwait_wake_up(&manager_wait);

2933 return true;

2934 }

2935

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logics necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * The function runs in three phases: under pool->lock it records @work as
 * @worker's current item and dequeues it; with the lock dropped it invokes
 * the work function; then it retakes pool->lock, clears the per-worker
 * "current" state and drops the pwq's in-flight count.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct worker_pool *pool = worker->pool;
	unsigned long work_data;
#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the struct work_struct from
	 * inside the function that is called from it, this we need to
	 * take into account for lockdep too.  To avoid bogus "held
	 * lock freed" warnings as well as problems when looking into
	 * work->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map;

	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
	/* ensure we're on the correct CPU */
	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
		     raw_smp_processor_id() != pool->cpu);

	/* claim and dequeue */
	debug_work_deactivate(work);
	/* make @worker findable in the pool's busy_hash, keyed by @work's address */
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
	worker->current_work = work;
	worker->current_func = work->func;
	worker->current_pwq = pwq;
	worker->current_at = worker->task->se.sum_exec_runtime;
	/*
	 * Snapshot the data word now: it is needed by pwq_dec_nr_in_flight()
	 * at the very end, by which time @work itself must not be touched.
	 */
	work_data = *work_data_bits(work);
	worker->current_color = get_work_color(work_data);

	/*
	 * Record wq name for cmdline and debug reporting, may get
	 * overridden through set_worker_desc().
	 */
	strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

	list_del_init(&work->entry);

	/*
	 * CPU intensive works don't participate in concurrency management.
	 * They're the scheduler's responsibility.  This takes @worker out
	 * of concurrency management and the next code block will chain
	 * execution of the pending work items.
	 */
	if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);

	/*
	 * Kick @pool if necessary. It's always noop for per-cpu worker pools
	 * since nr_running would always be >= 1 at this point. This is used to
	 * chain execution of the pending work items for WORKER_NOT_RUNNING
	 * workers such as the UNBOUND and CPU_INTENSIVE ones.
	 */
	kick_pool(pool);

	/*
	 * Record the last pool and clear PENDING which should be the last
	 * update to @work.  Also, do this inside @pool->lock so that
	 * PENDING and queued state changes happen together while IRQ is
	 * disabled.
	 */
	set_work_pool_and_clear_pending(work, pool->id);

	pwq->stats[PWQ_STAT_STARTED]++;
	raw_spin_unlock_irq(&pool->lock);

	/*
	 * NOTE(review): lockdep_map is only declared under CONFIG_LOCKDEP;
	 * presumably lock_map_acquire()/lock_map_release() compile away
	 * without it - confirm.
	 */
	lock_map_acquire(&pwq->wq->lockdep_map);
	lock_map_acquire(&lockdep_map);
	/*
	 * Strictly speaking we should mark the invariant state without holding
	 * any locks, that is, before these two lock_map_acquire()'s.
	 *
	 * However, that would result in:
	 *
	 *   A(W1)
	 *   WFC(C)
	 *		A(W1)
	 *		C(C)
	 *
	 * Which would create W1->C->W1 dependencies, even though there is no
	 * actual deadlock possible. There are two solutions, using a
	 * read-recursive acquire on the work(queue) 'locks', but this will then
	 * hit the lockdep limitation on recursive locks, or simply discard
	 * these locks.
	 *
	 * AFAICT there is no possible deadlock scenario between the
	 * flush_work() and complete() primitives (except for single-threaded
	 * workqueues), so hiding them isn't a problem.
	 */
	lockdep_invariant_state(true);
	trace_workqueue_execute_start(work);
	/* call through the cached pointer rather than re-reading work->func */
	worker->current_func(work);
	/*
	 * While we must be careful to not use "work" after this, the trace
	 * point will only record its address.
	 */
	trace_workqueue_execute_end(work, worker->current_func);
	pwq->stats[PWQ_STAT_COMPLETED]++;
	lock_map_release(&lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);

	/* complain if the work function returned in atomic context or with a lock / RCU read section held */
	if (unlikely(in_atomic() || lockdep_depth(current) > 0 ||
		     rcu_preempt_depth() > 0)) {
		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d/%d\n"
		       "     last function: %ps\n",
		       current->comm, preempt_count(), rcu_preempt_depth(),
		       task_pid_nr(current), worker->current_func);
		debug_show_held_locks(current);
		dump_stack();
	}

	/*
	 * The following prevents a kworker from hogging CPU on !PREEMPTION
	 * kernels, where a requeueing work item waiting for something to
	 * happen could deadlock with stop_machine as such work item could
	 * indefinitely requeue itself while all other CPUs are trapped in
	 * stop_machine. At the same time, report a quiescent RCU state so
	 * the same condition doesn't freeze RCU.
	 */
	cond_resched();

	raw_spin_lock_irq(&pool->lock);

	/*
	 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
	 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
	 * wq_cpu_intensive_thresh_us. Clear it.
	 */
	worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

	/* tag the worker for identification in schedule() */
	worker->last_func = worker->current_func;

	/* we're done with it, release */
	hash_del(&worker->hentry);
	worker->current_work = NULL;
	worker->current_func = NULL;
	worker->current_pwq = NULL;
	worker->current_color = INT_MAX;

	/* must be the last step, see the function comment */
	pwq_dec_nr_in_flight(pwq, work_data);
}

3096

3097 /**

3098 * process_scheduled_works - process scheduled works

3099 * @worker: self

3100 *

3101 * Process all scheduled works. Please note that the scheduled list

3102 * may change while processing a work, so this function repeatedly

3103 * fetches a work from the top and executes it.

3104 *

3105 * CONTEXT:

3106 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed

3107 * multiple times.

3108 */

3109 static void process_scheduled_works(struct worker *worker)

3110 {

3111 struct work_struct *work;

3112 bool first = true;

3113

3114 while ((work = list_first_entry_or_null(&worker->scheduled,

3115 struct work_struct, entry))) {

3116 if (first) {

3117 worker->pool->watchdog_ts = jiffies;

3118 first = false;

3119 }

3120 process_one_work(worker, work);

3121 }

3122 }

3123

3124 static void set_pf_worker(bool val)

3125 {

3126 mutex_lock(&wq_pool_attach_mutex);

3127 if (val)

3128 current->flags |= PF_WQ_WORKER;

3129 else

3130 current->flags &= ~PF_WQ_WORKER;

3131 mutex_unlock(&wq_pool_attach_mutex);

3132 }

3133

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Each wakeup goes through the same sequence under pool->lock: exit if
 * %WORKER_DIE is set, leave idle, become manager if no worker may start
 * working, process work items while keep_working() holds, then re-enter
 * idle and sleep in %TASK_IDLE.
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct worker_pool *pool = worker->pool;

	/* tell the scheduler that this is a workqueue worker */
	set_pf_worker(true);
woke_up:
	raw_spin_lock_irq(&pool->lock);

	/* am I supposed to die? */
	if (unlikely(worker->flags & WORKER_DIE)) {
		raw_spin_unlock_irq(&pool->lock);
		set_pf_worker(false);

		set_task_comm(worker->task, "kworker/dying");
		ida_free(&pool->worker_ida, worker->id);
		worker_detach_from_pool(worker);
		WARN_ON_ONCE(!list_empty(&worker->entry));
		/* the worker frees its own descriptor; @worker is invalid past this point */
		kfree(worker);
		return 0;
	}

	worker_leave_idle(worker);
recheck:
	/* no more worker necessary? */
	if (!need_more_worker(pool))
		goto sleep;

	/* do we need to manage? */
	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
		goto recheck;

	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
	WARN_ON_ONCE(!list_empty(&worker->scheduled));

	/*
	 * Finish PREP stage.  We're guaranteed to have at least one idle
	 * worker or that someone else has already assumed the manager
	 * role.  This is where @worker starts participating in concurrency
	 * management if applicable and concurrency management is restored
	 * after being rebound.  See rebind_workers() for details.
	 */
	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

	/*
	 * NOTE(review): list_first_entry() is used without an emptiness
	 * check; presumably need_more_worker() and keep_working() only
	 * return true while pool->worklist is non-empty - confirm.
	 */
	do {
		struct work_struct *work =
			list_first_entry(&pool->worklist,
					 struct work_struct, entry);

		if (assign_work(work, worker, NULL))
			process_scheduled_works(worker);
	} while (keep_working(pool));

	/* back to PREP; it is cleared again above on the next processing round */
	worker_set_flags(worker, WORKER_PREP);
sleep:
	/*
	 * pool->lock is held and there's no work to process and no need to
	 * manage, sleep.  Workers are woken up only while holding
	 * pool->lock or from local cpu, so setting the current state
	 * before releasing pool->lock is enough to prevent losing any
	 * event.
	 */
	worker_enter_idle(worker);
	__set_current_state(TASK_IDLE);
	raw_spin_unlock_irq(&pool->lock);
	schedule();
	goto woke_up;
}

3219

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * For every pwq found on @wq->maydays the rescuer attaches to that pwq's
 * pool, moves the pool's work items belonging to the pwq onto its own
 * scheduled list, processes them, then detaches again.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
	struct worker *rescuer = __rescuer;
	struct workqueue_struct *wq = rescuer->rescue_wq;
	bool should_stop;

	set_user_nice(current, RESCUER_NICE_LEVEL);

	/*
	 * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
	 * doesn't participate in concurrency management.
	 */
	set_pf_worker(true);
repeat:
	set_current_state(TASK_IDLE);

	/*
	 * By the time the rescuer is requested to stop, the workqueue
	 * shouldn't have any work pending, but @wq->maydays may still have
	 * pwq(s) queued.  This can happen by non-rescuer workers consuming
	 * all the work items before the rescuer got to them.  Go through
	 * @wq->maydays processing before acting on should_stop so that the
	 * list is always empty on exit.
	 */
	should_stop = kthread_should_stop();

	/* see whether any pwq is asking for help */
	raw_spin_lock_irq(&wq_mayday_lock);

	while (!list_empty(&wq->maydays)) {
		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
					struct pool_workqueue, mayday_node);
		struct worker_pool *pool = pwq->pool;
		struct work_struct *work, *n;

		__set_current_state(TASK_RUNNING);
		list_del_init(&pwq->mayday_node);

		/* wq_mayday_lock is dropped while this pwq is being serviced */
		raw_spin_unlock_irq(&wq_mayday_lock);

		worker_attach_to_pool(rescuer, pool);

		raw_spin_lock_irq(&pool->lock);

		/*
		 * Slurp in all works issued via this workqueue and
		 * process'em.
		 *
		 * NOTE(review): &n is passed to assign_work(); presumably so
		 * it can fix up the iteration cursor when it moves more than
		 * one entry off the worklist - confirm.
		 */
		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
		list_for_each_entry_safe(work, n, &pool->worklist, entry) {
			if (get_work_pwq(work) == pwq &&
			    assign_work(work, rescuer, &n))
				pwq->stats[PWQ_STAT_RESCUED]++;
		}

		if (!list_empty(&rescuer->scheduled)) {
			process_scheduled_works(rescuer);

			/*
			 * The above execution of rescued work items could
			 * have created more to rescue through
			 * pwq_activate_first_inactive() or chained
			 * queueing.  Let's put @pwq back on mayday list so
			 * that such back-to-back work items, which may be
			 * being used to relieve memory pressure, don't
			 * incur MAYDAY_INTERVAL delay inbetween.
			 */
			if (pwq->nr_active && need_to_create_worker(pool)) {
				/* nests inside pool->lock, same order as pool_mayday_timeout() */
				raw_spin_lock(&wq_mayday_lock);
				/*
				 * Queue iff we aren't racing destruction
				 * and somebody else hasn't queued it already.
				 */
				if (wq->rescuer && list_empty(&pwq->mayday_node)) {
					get_pwq(pwq);
					list_add_tail(&pwq->mayday_node, &wq->maydays);
				}
				raw_spin_unlock(&wq_mayday_lock);
			}
		}

		/*
		 * Put the reference grabbed by send_mayday().  @pool won't
		 * go away while we're still attached to it.
		 */
		put_pwq(pwq);

		/*
		 * Leave this pool. Notify regular workers; otherwise, we end up
		 * with 0 concurrency and stalling the execution.
		 */
		kick_pool(pool);

		raw_spin_unlock_irq(&pool->lock);

		worker_detach_from_pool(rescuer);

		/* retake the list lock before re-evaluating the loop condition */
		raw_spin_lock_irq(&wq_mayday_lock);
	}

	raw_spin_unlock_irq(&wq_mayday_lock);

	/* act on the stop request sampled before the mayday list was drained */
	if (should_stop) {
		__set_current_state(TASK_RUNNING);
		set_pf_worker(false);
		return 0;
	}

	/* rescuers should never participate in concurrency management */
	WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
	schedule();
	goto repeat;
}

3354

3355 /**

3356 * check_flush_dependency - check for flush dependency sanity

3357 * @target_wq: workqueue being flushed

3358 * @target_work: work item being flushed (NULL for workqueue flushes)

3359 *

3360 * %current is trying to flush the whole @target_wq or @target_work on it.

3361 * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not

3362 * reclaiming memory or running on a workqueue which doesn't have

3363 * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to

3364 * a deadlock.

3365 */

3366 static void check_flush_dependency(struct workqueue_struct *target_wq,

3367 struct work_struct *target_work)

3368 {

3369 work_func_t target_func = target_work ? target_work->func : NULL;

3370 struct worker *worker;

3371

3372 if (target_wq->flags & WQ_MEM_RECLAIM)

3373 return;

3374

3375 worker = current_wq_worker();

3376

3377 WARN_ONCE(current->flags & PF_MEMALLOC,

3378 "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",

3379 current->pid, current->comm, target_wq->name, target_func);

3380 WARN_ONCE(worker && ((worker->current_pwq->wq->flags &

3381 (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),

3382 "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",

3383 worker->current_pwq->wq->name, worker->current_func,

3384 target_wq->name, target_func);

3385 }

3386

/*
 * A barrier work item: a work_struct whose function, wq_barrier_func(),
 * completes @done when it runs. insert_wq_barrier() initializes one with
 * INIT_WORK_ONSTACK() and queues it.
 */
struct wq_barrier {
	struct work_struct	work;	/* the barrier work item itself */
	struct completion	done;	/* completed by wq_barrier_func() */
	struct task_struct	*task;	/* purely informational */
};

3392

3393 static void wq_barrier_func(struct work_struct *work)

3394 {

3395 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);

3396 complete(&barr->done);

3397 }

3398

3399 /**

3400 * insert_wq_barrier - insert a barrier work

3401 * @pwq: pwq to insert barrier into

3402 * @barr: wq_barrier to insert

3403 * @target: target work to attach @barr to

3404 * @worker: worker currently executing @target, NULL if @target is not executing

3405 *

3406 * @barr is linked to @target such that @barr is completed only after

3407 * @target finishes execution. Please note that the ordering

3408 * guarantee is observed only with respect to @target and on the local

3409 * cpu.

3410 *

3411 * Currently, a queued barrier can't be canceled. This is because

3412 * try_to_grab_pending() can't determine whether the work to be

3413 * grabbed is at the head of the queue and thus can't clear LINKED

3414 * flag of the previous work while there must be a valid next work

3415 * after a work with LINKED flag set.

3416 *

3417 * Note that when @worker is non-NULL, @target may be modified

3418 * underneath us, so we can't reliably determine pwq from @target.

3419 *

3420 * CONTEXT:

3421 * raw_spin_lock_irq(pool->lock).

3422 */

3423 static void insert_wq_barrier(struct pool_workqueue *pwq,

3424 struct wq_barrier *barr,

3425 struct work_struct *target, struct worker *worker)

3426 {

3427 unsigned int work_flags = 0;

3428 unsigned int work_color;

3429 struct list_head *head;

3430

3431 /*

3432 * debugobject calls are safe here even with pool->lock locked

3433 * as we know for sure that this will not trigger any of the

3434 * checks and call back into the fixup functions where we

3435 * might deadlock.

3436 */

3437 INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);

3438 __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

3439

3440 init_completion_map(&barr->done, &target->lockdep_map);

3441

3442 barr->task = current;

3443

3444 /* The barrier work item does not participate in nr_active. */

3445 work_flags |= WORK_STRUCT_INACTIVE;

3446

3447 /*

3448 * If @target is currently being executed, schedule the

3449 * barrier to the worker; otherwise, put it after @target.

3450 */

3451 if (worker) {

3452 head = worker->scheduled.next;

3453 work_color = worker->current_color;

3454 } else {

3455 unsigned long *bits = work_data_bits(target);

3456

3457 head = target->entry.next;

3458 /* there can already be other linked works, inherit and set */

3459 work_flags |= *bits & WORK_STRUCT_LINKED;

3460 work_color = get_work_color(*bits);

3461 __set_bit(WORK_STRUCT_LINKED_BIT, bits);

3462 }

3463

3464 pwq->nr_in_flight[work_color]++;

3465 work_flags |= work_color_to_flags(work_color);

3466

3467 insert_work(pwq, &barr->work, head, work_flags);

3468 }

3469

3470 /**

3471 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing

3472 * @wq: workqueue being flushed

3473 * @flush_color: new flush color, < 0 for no-op

3474 * @work_color: new work color, < 0 for no-op

3475 *

3476 * Prepare pwqs for workqueue flushing.

3477 *

3478 * If @flush_color is non-negative, flush_color on all pwqs should be

3479 * -1. If no pwq has in-flight commands at the specified color, all

3480 * pwq->flush_color's stay at -1 and %false is returned. If any pwq

3481 * has in flight commands, its pwq->flush_color is set to

3482 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq

3483 * wakeup logic is armed and %true is returned.

3484 *

3485 * The caller should have initialized @wq->first_flusher prior to

3486 * calling this function with non-negative @flush_color. If

3487 * @flush_color is negative, no flush color update is done and %false

3488 * is returned.

3489 *

3490 * If @work_color is non-negative, all pwqs should have the same

3491 * work_color which is previous to @work_color and all will be

3492 * advanced to @work_color.

3493 *

3494 * CONTEXT:

3495 * mutex_lock(wq->mutex).

3496 *

3497 * Return:

3498 * %true if @flush_color >= 0 and there's something to flush. %false

3499 * otherwise.

3500 */

3501 static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,

3502 int flush_color, int work_color)

3503 {

3504 bool wait = false;

3505 struct pool_workqueue *pwq;

3506

3507 if (flush_color >= 0) {

3508 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));

3509 atomic_set(&wq->nr_pwqs_to_flush, 1);

3510 }

3511

3512 for_each_pwq(pwq, wq) {

3513 struct worker_pool *pool = pwq->pool;

3514

3515 raw_spin_lock_irq(&pool->lock);

3516

3517 if (flush_color >= 0) {

3518 WARN_ON_ONCE(pwq->flush_color != -1);

3519

3520 if (pwq->nr_in_flight[flush_color]) {

3521 pwq->flush_color = flush_color;

3522 atomic_inc(&wq->nr_pwqs_to_flush);

3523 wait = true;

3524 }

3525 }

3526

3527 if (work_color >= 0) {

3528 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));

3529 pwq->work_color = work_color;

3530 }

3531

3532 raw_spin_unlock_irq(&pool->lock);

3533 }

3534

3535 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))

3536 complete(&wq->first_flusher->done);

3537

3538 return wait;

3539 }

3540

/**
 * __flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 *
 * The flusher either becomes @wq->first_flusher, joins @wq->flusher_queue
 * behind it, or - when no flush color is free - waits on
 * @wq->flusher_overflow. After being woken, only the first flusher goes on
 * to complete same-colored flushers, hand out colors to overflowed ones and
 * promote the next first flusher.
 */
void __flush_workqueue(struct workqueue_struct *wq)
{
	/* on-stack record, linked into @wq's flusher lists or installed as first_flusher below */
	struct wq_flusher this_flusher = {
		.list = LIST_HEAD_INIT(this_flusher.list),
		.flush_color = -1,
		.done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
	};
	int next_color;

	if (WARN_ON(!wq_online))
		return;

	/*
	 * NOTE(review): acquire immediately followed by release, with nothing
	 * in between; looks like a pure lockdep dependency annotation -
	 * confirm.
	 */
	lock_map_acquire(&wq->lockdep_map);
	lock_map_release(&wq->lockdep_map);

	mutex_lock(&wq->mutex);

	/*
	 * Start-to-wait phase
	 */
	next_color = work_next_color(wq->work_color);

	if (next_color != wq->flush_color) {
		/*
		 * Color space is not full.  The current work_color
		 * becomes our flush_color and work_color is advanced
		 * by one.
		 */
		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
		this_flusher.flush_color = wq->work_color;
		wq->work_color = next_color;

		if (!wq->first_flusher) {
			/* no flush in progress, become the first flusher */
			WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

			wq->first_flusher = &this_flusher;

			if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
						       wq->work_color)) {
				/* nothing to flush, done */
				wq->flush_color = next_color;
				wq->first_flusher = NULL;
				goto out_unlock;
			}
		} else {
			/* wait in queue */
			WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
			list_add_tail(&this_flusher.list, &wq->flusher_queue);
			/* flush_color of -1: only advance the pwqs' work color */
			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
		}
	} else {
		/*
		 * Oops, color space is full, wait on overflow queue.
		 * The next flush completion will assign us
		 * flush_color and transfer to flusher_queue.
		 */
		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
	}

	check_flush_dependency(wq, NULL);

	mutex_unlock(&wq->mutex);

	wait_for_completion(&this_flusher.done);

	/*
	 * Wake-up-and-cascade phase
	 *
	 * First flushers are responsible for cascading flushes and
	 * handling overflow.  Non-first flushers can simply return.
	 */
	if (READ_ONCE(wq->first_flusher) != &this_flusher)
		return;

	mutex_lock(&wq->mutex);

	/* we might have raced, check again with mutex held */
	if (wq->first_flusher != &this_flusher)
		goto out_unlock;

	WRITE_ONCE(wq->first_flusher, NULL);

	WARN_ON_ONCE(!list_empty(&this_flusher.list));
	WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

	while (true) {
		struct wq_flusher *next, *tmp;

		/*
		 * complete all the flushers sharing the current flush color
		 *
		 * NOTE(review): @next is used again after this loop; that
		 * relies on the loop having stopped on a real flusher
		 * whenever flusher_queue ends up non-empty below - confirm.
		 */
		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
			if (next->flush_color != wq->flush_color)
				break;
			list_del_init(&next->list);
			complete(&next->done);
		}

		WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
			     wq->flush_color != work_next_color(wq->work_color));

		/* this flush_color is finished, advance by one */
		wq->flush_color = work_next_color(wq->flush_color);

		/* one color has been freed, handle overflow queue */
		if (!list_empty(&wq->flusher_overflow)) {
			/*
			 * Assign the same color to all overflowed
			 * flushers, advance work_color and append to
			 * flusher_queue.  This is the start-to-wait
			 * phase for these overflowed flushers.
			 */
			list_for_each_entry(tmp, &wq->flusher_overflow, list)
				tmp->flush_color = wq->work_color;

			wq->work_color = work_next_color(wq->work_color);

			list_splice_tail_init(&wq->flusher_overflow,
					      &wq->flusher_queue);
			flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
		}

		if (list_empty(&wq->flusher_queue)) {
			WARN_ON_ONCE(wq->flush_color != wq->work_color);
			break;
		}

		/*
		 * Need to flush more colors.  Make the next flusher
		 * the new first flusher and arm pwqs.
		 */
		WARN_ON_ONCE(wq->flush_color == wq->work_color);
		WARN_ON_ONCE(wq->flush_color != next->flush_color);

		list_del_init(&next->list);
		wq->first_flusher = next;

		/* something is in flight for this color: the new first flusher takes over */
		if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
			break;

		/*
		 * Meh... this color is already done, clear first
		 * flusher and repeat cascading.
		 */
		wq->first_flusher = NULL;
	}

out_unlock:
	mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(__flush_workqueue);

3698

3699 /**

3700 * drain_workqueue - drain a workqueue

3701 * @wq: workqueue to drain

3702 *

3703 * Wait until the workqueue becomes empty. While draining is in progress,

3704 * only chain queueing is allowed. IOW, only currently pending or running

3705 * work items on @wq can queue further work items on it. @wq is flushed

3706 * repeatedly until it becomes empty. The number of flushing is determined

3707 * by the depth of chaining and should be relatively short. Whine if it

3708 * takes too long.

3709 */

3710 void drain_workqueue(struct workqueue_struct *wq)

3711 {

3712 unsigned int flush_cnt = 0;

3713 struct pool_workqueue *pwq;

3714

3715 /*

3716 * __queue_work() needs to test whether there are drainers, is much

3717 * hotter than drain_workqueue() and already looks at @wq->flags.

3718 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.

3719 */

3720 mutex_lock(&wq->mutex);

3721 if (!wq->nr_drainers++)

3722 wq->flags |= __WQ_DRAINING;

3723 mutex_unlock(&wq->mutex);

3724 reflush:

3725 __flush_workqueue(wq);

3726

3727 mutex_lock(&wq->mutex);

3728

3729 for_each_pwq(pwq, wq) {

3730 bool drained;

3731

3732 raw_spin_lock_irq(&pwq->pool->lock);

3733 drained = pwq_is_empty(pwq);

3734 raw_spin_unlock_irq(&pwq->pool->lock);

3735

3736 if (drained)

3737 continue;

3738

3739 if (++flush_cnt == 10 ||

3740 (flush_cnt % 100 == 0 && flush_cnt <= 1000))

3741 pr_warn("workqueue %s: %s() isn't complete after %u tries\n",

3742 wq->name, __func__, flush_cnt);

3743

3744 mutex_unlock(&wq->mutex);

3745 goto reflush;

3746 }

3747

3748 if (!--wq->nr_drainers)

3749 wq->flags &= ~__WQ_DRAINING;

3750 mutex_unlock(&wq->mutex);

3751 }

3752 EXPORT_SYMBOL_GPL(drain_workqueue);

3753

3754 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,

3755 bool from_cancel)

3756 {

3757 struct worker *worker = NULL;

3758 struct worker_pool *pool;

3759 struct pool_workqueue *pwq;

3760

3761 might_sleep();

3762

3763 rcu_read_lock();

3764 pool = get_work_pool(work);

3765 if (!pool) {

3766 rcu_read_unlock();

3767 return false;

3768 }

3769

3770 raw_spin_lock_irq(&pool->lock);

3771 /* see the comment in try_to_grab_pending() with the same code */

3772 pwq = get_work_pwq(work);

3773 if (pwq) {

3774 if (unlikely(pwq->pool != pool))

3775 goto already_gone;

3776 } else {

3777 worker = find_worker_executing_work(pool, work);

3778 if (!worker)

3779 goto already_gone;

3780 pwq = worker->current_pwq;

3781 }

3782

3783 check_flush_dependency(pwq->wq, work);

3784

3785 insert_wq_barrier(pwq, barr, work, worker);

3786 raw_spin_unlock_irq(&pool->lock);

3787

3788 /*

3789 * Force a lock recursion deadlock when using flush_work() inside a

3790 * single-threaded or rescuer equipped workqueue.

3791 *

3792 * For single threaded workqueues the deadlock happens when the work

3793 * is after the work issuing the flush_work(). For rescuer equipped

3794 * workqueues the deadlock happens when the rescuer stalls, blocking

3795 * forward progress.

3796 */

3797 if (!from_cancel &&

3798 (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {

3799 lock_map_acquire(&pwq->wq->lockdep_map);

3800 lock_map_release(&pwq->wq->lockdep_map);

3801 }

3802 rcu_read_unlock();

3803 return true;

3804 already_gone:

3805 raw_spin_unlock_irq(&pool->lock);

3806 rcu_read_unlock();

3807 return false;

3808 }

3809

3810 static bool __flush_work(struct work_struct *work, bool from_cancel)

3811 {

3812 struct wq_barrier barr;

3813

3814 if (WARN_ON(!wq_online))

3815 return false;

3816

3817 if (WARN_ON(!work->func))

3818 return false;

3819

3820 lock_map_acquire(&work->lockdep_map);

3821 lock_map_release(&work->lockdep_map);

3822

3823 if (start_flush_work(work, &barr, from_cancel)) {

3824 wait_for_completion(&barr.done);

3825 destroy_work_on_stack(&barr.work);

3826 return true;

3827 } else {

3828 return false;

3829 }

3830 }

3831

3832 /**

3833 * flush_work - wait for a work to finish executing the last queueing instance

3834 * @work: the work to flush

3835 *

3836 * Wait until @work has finished execution. @work is guaranteed to be idle

3837 * on return if it hasn't been requeued since flush started.

3838 *

3839 * Return:

3840 * %true if flush_work() waited for the work to finish execution,

3841 * %false if it was already idle.

3842 */

3843 bool flush_work(struct work_struct *work)

3844 {

3845 return __flush_work(work, false);

3846 }

3847 EXPORT_SYMBOL_GPL(flush_work);

3848

3849 struct cwt_wait {

3850 wait_queue_entry_t wait;

3851 struct work_struct *work;

3852 };

3853

3854 static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)

3855 {

3856 struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);

3857

3858 if (cwait->work != key)

3859 return 0;

3860 return autoremove_wake_function(wait, mode, sync, key);

3861 }

3862

3863 static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)

3864 {

3865 static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);

3866 unsigned long flags;

3867 int ret;

3868

3869 do {

3870 ret = try_to_grab_pending(work, is_dwork, &flags);

3871 /*

3872 * If someone else is already canceling, wait for it to

3873 * finish. flush_work() doesn't work for PREEMPT_NONE

3874 * because we may get scheduled between @work's completion

3875 * and the other canceling task resuming and clearing

3876 * CANCELING - flush_work() will return false immediately

3877 * as @work is no longer busy, try_to_grab_pending() will

3878 * return -ENOENT as @work is still being canceled and the

3879 * other canceling task won't be able to clear CANCELING as

3880 * we're hogging the CPU.

3881 *

3882 * Let's wait for completion using a waitqueue. As this

3883 * may lead to the thundering herd problem, use a custom

3884 * wake function which matches @work along with exclusive

3885 * wait and wakeup.

3886 */

3887 if (unlikely(ret == -ENOENT)) {

3888 struct cwt_wait cwait;

3889

3890 init_wait(&cwait.wait);

3891 cwait.wait.func = cwt_wakefn;

3892 cwait.work = work;

3893

3894 prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,

3895 TASK_UNINTERRUPTIBLE);

3896 if (work_is_canceling(work))

3897 schedule();

3898 finish_wait(&cancel_waitq, &cwait.wait);

3899 }

3900 } while (unlikely(ret < 0));

3901

3902 /* tell other tasks trying to grab @work to back off */

3903 mark_work_canceling(work);

3904 local_irq_restore(flags);

3905

3906 /*

3907 * This allows canceling during early boot. We know that @work

3908 * isn't executing.

3909 */

3910 if (wq_online)

3911 __flush_work(work, true);

3912

3913 clear_work_data(work);

3914

3915 /*

3916 * Paired with prepare_to_wait() above so that either

3917 * waitqueue_active() is visible here or !work_is_canceling() is

3918 * visible there.

3919 */

3920 smp_mb();

3921 if (waitqueue_active(&cancel_waitq))

3922 __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);

3923

3924 return ret;

3925 }

3926

3927 /**

3928 * cancel_work_sync - cancel a work and wait for it to finish

3929 * @work: the work to cancel

3930 *

3931 * Cancel @work and wait for its execution to finish. This function

3932 * can be used even if the work re-queues itself or migrates to

3933 * another workqueue. On return from this function, @work is

3934 * guaranteed to be not pending or executing on any CPU.

3935 *

3936 * cancel_work_sync(&delayed_work->work) must not be used for

3937 * delayed_work's. Use cancel_delayed_work_sync() instead.

3938 *

3939 * The caller must ensure that the workqueue on which @work was last

3940 * queued can't be destroyed before this function returns.

3941 *

3942 * Return:

3943 * %true if @work was pending, %false otherwise.

3944 */

3945 bool cancel_work_sync(struct work_struct *work)

3946 {

3947 return __cancel_work_timer(work, false);

3948 }

3949 EXPORT_SYMBOL_GPL(cancel_work_sync);

3950

3951 /**

3952 * flush_delayed_work - wait for a dwork to finish executing the last queueing

3953 * @dwork: the delayed work to flush

3954 *

3955 * Delayed timer is cancelled and the pending work is queued for

3956 * immediate execution. Like flush_work(), this function only

3957 * considers the last queueing instance of @dwork.

3958 *

3959 * Return:

3960 * %true if flush_work() waited for the work to finish execution,

3961 * %false if it was already idle.

3962 */

3963 bool flush_delayed_work(struct delayed_work *dwork)

3964 {

3965 local_irq_disable();

3966 if (del_timer_sync(&dwork->timer))

3967 __queue_work(dwork->cpu, dwork->wq, &dwork->work);

3968 local_irq_enable();

3969 return flush_work(&dwork->work);

3970 }

3971 EXPORT_SYMBOL(flush_delayed_work);

3972

3973 /**

3974 * flush_rcu_work - wait for a rwork to finish executing the last queueing

3975 * @rwork: the rcu work to flush

3976 *

3977 * Return:

3978 * %true if flush_rcu_work() waited for the work to finish execution,

3979 * %false if it was already idle.

3980 */

3981 bool flush_rcu_work(struct rcu_work *rwork)

3982 {

3983 if (test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&rwork->work))) {

3984 rcu_barrier();

3985 flush_work(&rwork->work);

3986 return true;

3987 } else {

3988 return flush_work(&rwork->work);

3989 }

3990 }

3991 EXPORT_SYMBOL(flush_rcu_work);

3992

3993 static bool __cancel_work(struct work_struct *work, bool is_dwork)

3994 {

3995 unsigned long flags;

3996 int ret;

3997

3998 do {

3999 ret = try_to_grab_pending(work, is_dwork, &flags);

4000 } while (unlikely(ret == -EAGAIN));

4001

4002 if (unlikely(ret < 0))

4003 return false;

4004

4005 set_work_pool_and_clear_pending(work, get_work_pool_id(work));

4006 local_irq_restore(flags);

4007 return ret;

4008 }

4009

4010 /*

4011 * See cancel_delayed_work()

4012 */

4013 bool cancel_work(struct work_struct *work)

4014 {

4015 return __cancel_work(work, false);

4016 }

4017 EXPORT_SYMBOL(cancel_work);

4018

4019 /**

4020 * cancel_delayed_work - cancel a delayed work

4021 * @dwork: delayed_work to cancel

4022 *

4023 * Kill off a pending delayed_work.

4024 *

4025 * Return: %true if @dwork was pending and canceled; %false if it wasn't

4026 * pending.

4027 *

4028 * Note:

4029 * The work callback function may still be running on return, unless

4030 * it returns %true and the work doesn't re-arm itself. Explicitly flush or

4031 * use cancel_delayed_work_sync() to wait on it.

4032 *

4033 * This function is safe to call from any context including IRQ handler.

4034 */

4035 bool cancel_delayed_work(struct delayed_work *dwork)

4036 {

4037 return __cancel_work(&dwork->work, true);

4038 }

4039 EXPORT_SYMBOL(cancel_delayed_work);

4040

4041 /**

4042 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish

4043 * @dwork: the delayed work cancel

4044 *

4045 * This is cancel_work_sync() for delayed works.

4046 *

4047 * Return:

4048 * %true if @dwork was pending, %false otherwise.

4049 */

4050 bool cancel_delayed_work_sync(struct delayed_work *dwork)

4051 {

4052 return __cancel_work_timer(&dwork->work, true);

4053 }

4054 EXPORT_SYMBOL(cancel_delayed_work_sync);

4055

4056 /**

4057 * schedule_on_each_cpu - execute a function synchronously on each online CPU

4058 * @func: the function to call

4059 *

4060 * schedule_on_each_cpu() executes @func on each online CPU using the

4061 * system workqueue and blocks until all CPUs have completed.

4062 * schedule_on_each_cpu() is very slow.

4063 *

4064 * Return:

4065 * 0 on success, -errno on failure.

4066 */

4067 int schedule_on_each_cpu(work_func_t func)

4068 {

4069 int cpu;

4070 struct work_struct __percpu *works;

4071

4072 works = alloc_percpu(struct work_struct);

4073 if (!works)

4074 return -ENOMEM;

4075

4076 cpus_read_lock();

4077

4078 for_each_online_cpu(cpu) {

4079 struct work_struct *work = per_cpu_ptr(works, cpu);

4080

4081 INIT_WORK(work, func);

4082 schedule_work_on(cpu, work);

4083 }

4084

4085 for_each_online_cpu(cpu)

4086 flush_work(per_cpu_ptr(works, cpu));

4087

4088 cpus_read_unlock();

4089 free_percpu(works);

4090 return 0;

4091 }

4092

4093 /**

4094 * execute_in_process_context - reliably execute the routine with user context

4095 * @fn: the function to execute

4096 * @ew: guaranteed storage for the execute work structure (must

4097 * be available when the work executes)

4098 *

4099 * Executes the function immediately if process context is available,

4100 * otherwise schedules the function for delayed execution.

4101 *

4102 * Return: 0 - function was executed

4103 * 1 - function was scheduled for execution

4104 */

4105 int execute_in_process_context(work_func_t fn, struct execute_work *ew)

4106 {

4107 if (!in_interrupt()) {

4108 fn(&ew->work);

4109 return 0;

4110 }

4111

4112 INIT_WORK(&ew->work, fn);

4113 schedule_work(&ew->work);

4114

4115 return 1;

4116 }

4117 EXPORT_SYMBOL_GPL(execute_in_process_context);

4118

4119 /**

4120 * free_workqueue_attrs - free a workqueue_attrs

4121 * @attrs: workqueue_attrs to free

4122 *

4123 * Undo alloc_workqueue_attrs().

4124 */

4125 void free_workqueue_attrs(struct workqueue_attrs *attrs)

4126 {

4127 if (attrs) {

4128 free_cpumask_var(attrs->cpumask);

4129 free_cpumask_var(attrs->__pod_cpumask);

4130 kfree(attrs);

4131 }

4132 }

4133

4134 /**

4135 * alloc_workqueue_attrs - allocate a workqueue_attrs

4136 *

4137 * Allocate a new workqueue_attrs, initialize with default settings and

4138 * return it.

4139 *

4140 * Return: The allocated new workqueue_attr on success. %NULL on failure.

4141 */

4142 struct workqueue_attrs *alloc_workqueue_attrs(void)

4143 {

4144 struct workqueue_attrs *attrs;

4145

4146 attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);

4147 if (!attrs)

4148 goto fail;

4149 if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))

4150 goto fail;

4151 if (!alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL))

4152 goto fail;

4153

4154 cpumask_copy(attrs->cpumask, cpu_possible_mask);

4155 attrs->affn_scope = WQ_AFFN_DFL;

4156 return attrs;

4157 fail:

4158 free_workqueue_attrs(attrs);

4159 return NULL;

4160 }

4161

4162 static void copy_workqueue_attrs(struct workqueue_attrs *to,

4163 const struct workqueue_attrs *from)

4164 {

4165 to->nice = from->nice;

4166 cpumask_copy(to->cpumask, from->cpumask);

4167 cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);

4168 to->affn_strict = from->affn_strict;

4169

4170 /*

4171 * Unlike hash and equality test, copying shouldn't ignore wq-only

4172 * fields as copying is used for both pool and wq attrs. Instead,

4173 * get_unbound_pool() explicitly clears the fields.

4174 */

4175 to->affn_scope = from->affn_scope;

4176 to->ordered = from->ordered;

4177 }

4178

4179 /*

4180 * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the

4181 * comments in 'struct workqueue_attrs' definition.

4182 */

4183 static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)

4184 {

4185 attrs->affn_scope = WQ_AFFN_NR_TYPES;

4186 attrs->ordered = false;

4187 }

4188

4189 /* hash value of the content of @attr */

4190 static u32 wqattrs_hash(const struct workqueue_attrs *attrs)

4191 {

4192 u32 hash = 0;

4193

4194 hash = jhash_1word(attrs->nice, hash);

4195 hash = jhash(cpumask_bits(attrs->cpumask),

4196 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);

4197 hash = jhash(cpumask_bits(attrs->__pod_cpumask),

4198 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);

4199 hash = jhash_1word(attrs->affn_strict, hash);

4200 return hash;

4201 }

4202

4203 /* content equality test */

4204 static bool wqattrs_equal(const struct workqueue_attrs *a,

4205 const struct workqueue_attrs *b)

4206 {

4207 if (a->nice != b->nice)

4208 return false;

4209 if (!cpumask_equal(a->cpumask, b->cpumask))

4210 return false;

4211 if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))

4212 return false;

4213 if (a->affn_strict != b->affn_strict)

4214 return false;

4215 return true;

4216 }

4217

4218 /* Update @attrs with actually available CPUs */

4219 static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,

4220 const cpumask_t *unbound_cpumask)

4221 {

4222 /*

4223 * Calculate the effective CPU mask of @attrs given @unbound_cpumask. If

4224 * @attrs->cpumask doesn't overlap with @unbound_cpumask, we fallback to

4225 * @unbound_cpumask.

4226 */

4227 cpumask_and(attrs->cpumask, attrs->cpumask, unbound_cpumask);

4228 if (unlikely(cpumask_empty(attrs->cpumask)))

4229 cpumask_copy(attrs->cpumask, unbound_cpumask);

4230 }

4231

4232 /* find wq_pod_type to use for @attrs */

4233 static const struct wq_pod_type *

4234 wqattrs_pod_type(const struct workqueue_attrs *attrs)

4235 {

4236 enum wq_affn_scope scope;

4237 struct wq_pod_type *pt;

4238

4239 /* to synchronize access to wq_affn_dfl */

4240 lockdep_assert_held(&wq_pool_mutex);

4241

4242 if (attrs->affn_scope == WQ_AFFN_DFL)

4243 scope = wq_affn_dfl;

4244 else

4245 scope = attrs->affn_scope;

4246

4247 pt = &wq_pod_types[scope];

4248

4249 if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&

4250 likely(pt->nr_pods))

4251 return pt;

4252

4253 /*

4254 * Before workqueue_init_topology(), only SYSTEM is available which is

4255 * initialized in workqueue_init_early().

4256 */

4257 pt = &wq_pod_types[WQ_AFFN_SYSTEM];

4258 BUG_ON(!pt->nr_pods);

4259 return pt;

4260 }

4261

4262 /**

4263 * init_worker_pool - initialize a newly zalloc'd worker_pool

4264 * @pool: worker_pool to initialize

4265 *

4266 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.

4267 *

4268 * Return: 0 on success, -errno on failure. Even on failure, all fields

4269 * inside @pool proper are initialized and put_unbound_pool() can be called

4270 * on @pool safely to release it.

4271 */

4272 static int init_worker_pool(struct worker_pool *pool)

4273 {

4274 raw_spin_lock_init(&pool->lock);

4275 pool->id = -1;

4276 pool->cpu = -1;

4277 pool->node = NUMA_NO_NODE;

4278 pool->flags |= POOL_DISASSOCIATED;

4279 pool->watchdog_ts = jiffies;

4280 INIT_LIST_HEAD(&pool->worklist);

4281 INIT_LIST_HEAD(&pool->idle_list);

4282 hash_init(pool->busy_hash);

4283

4284 timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);

4285 INIT_WORK(&pool->idle_cull_work, idle_cull_fn);

4286

4287 timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

4288

4289 INIT_LIST_HEAD(&pool->workers);

4290 INIT_LIST_HEAD(&pool->dying_workers);

4291

4292 ida_init(&pool->worker_ida);

4293 INIT_HLIST_NODE(&pool->hash_node);

4294 pool->refcnt = 1;

4295

4296 /* shouldn't fail above this point */

4297 pool->attrs = alloc_workqueue_attrs();

4298 if (!pool->attrs)

4299 return -ENOMEM;

4300

4301 wqattrs_clear_for_pool(pool->attrs);

4302

4303 return 0;

4304 }

4305

#ifdef CONFIG_LOCKDEP
/*
 * Register @wq's lockdep key and initialise its lockdep map.  The map is
 * named "(wq_completion)<wq name>"; if that string cannot be allocated the
 * workqueue's own name buffer is used instead.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
	lockdep_register_key(&wq->key);

	wq->lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
	if (!wq->lock_name)
		wq->lock_name = wq->name;

	lockdep_init_map(&wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/* Unregister the lockdep key registered by wq_init_lockdep(). */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
	lockdep_unregister_key(&wq->key);
}

/* Free the lock name unless it merely aliases @wq->name. */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
	if (wq->lock_name == wq->name)
		return;

	kfree(wq->lock_name);
}
#else
/* !CONFIG_LOCKDEP: the lockdep helpers are no-ops */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

4343

4344 static void free_node_nr_active(struct wq_node_nr_active **nna_ar)

4345 {

4346 int node;

4347

4348 for_each_node(node) {

4349 kfree(nna_ar[node]);

4350 nna_ar[node] = NULL;

4351 }

4352

4353 kfree(nna_ar[nr_node_ids]);

4354 nna_ar[nr_node_ids] = NULL;

4355 }

4356

4357 static void init_node_nr_active(struct wq_node_nr_active *nna)

4358 {

4359 atomic_set(&nna->nr, 0);

4360 raw_spin_lock_init(&nna->lock);

4361 INIT_LIST_HEAD(&nna->pending_pwqs);

4362 }

4363

4364 /*

4365 * Each node's nr_active counter will be accessed mostly from its own node and

4366 * should be allocated in the node.

4367 */

4368 static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)

4369 {

4370 struct wq_node_nr_active *nna;

4371 int node;

4372

4373 for_each_node(node) {

4374 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);

4375 if (!nna)

4376 goto err_free;

4377 init_node_nr_active(nna);

4378 nna_ar[node] = nna;

4379 }

4380

4381 /* [nr_node_ids] is used as the fallback */

4382 nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);

4383 if (!nna)

4384 goto err_free;

4385 init_node_nr_active(nna);

4386 nna_ar[nr_node_ids] = nna;

4387

4388 return 0;

4389

4390 err_free:

4391 free_node_nr_active(nna_ar);

4392 return -ENOMEM;

4393 }

4394

4395 static void rcu_free_wq(struct rcu_head *rcu)

4396 {

4397 struct workqueue_struct *wq =

4398 container_of(rcu, struct workqueue_struct, rcu);

4399

4400 if (wq->flags & WQ_UNBOUND)

4401 free_node_nr_active(wq->node_nr_active);

4402

4403 wq_free_lockdep(wq);

4404 free_percpu(wq->cpu_pwq);

4405 free_workqueue_attrs(wq->unbound_attrs);

4406 kfree(wq);

4407 }

4408

4409 static void rcu_free_pool(struct rcu_head *rcu)

4410 {

4411 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);

4412

4413 ida_destroy(&pool->worker_ida);

4414 free_workqueue_attrs(pool->attrs);

4415 kfree(pool);

4416 }

4417

4418 /**

4419 * put_unbound_pool - put a worker_pool

4420 * @pool: worker_pool to put

4421 *

4422 * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU

4423 * safe manner. get_unbound_pool() calls this function on its failure path

4424 * and this function should be able to release pools which went through,

4425 * successfully or not, init_worker_pool().

4426 *

4427 * Should be called with wq_pool_mutex held.

4428 */

4429 static void put_unbound_pool(struct worker_pool *pool)

4430 {

4431 DECLARE_COMPLETION_ONSTACK(detach_completion);

4432 struct worker *worker;

4433 LIST_HEAD(cull_list);

4434

4435 lockdep_assert_held(&wq_pool_mutex);

4436

4437 if (--pool->refcnt)

4438 return;

4439

4440 /* sanity checks */

4441 if (WARN_ON(!(pool->cpu < 0)) ||

4442 WARN_ON(!list_empty(&pool->worklist)))

4443 return;

4444

4445 /* release id and unhash */

4446 if (pool->id >= 0)

4447 idr_remove(&worker_pool_idr, pool->id);

4448 hash_del(&pool->hash_node);

4449

4450 /*

4451 * Become the manager and destroy all workers. This prevents

4452 * @pool's workers from blocking on attach_mutex. We're the last

4453 * manager and @pool gets freed with the flag set.

4454 *

4455 * Having a concurrent manager is quite unlikely to happen as we can

4456 * only get here with

4457 * pwq->refcnt == pool->refcnt == 0

4458 * which implies no work queued to the pool, which implies no worker can

4459 * become the manager. However a worker could have taken the role of

4460 * manager before the refcnts dropped to 0, since maybe_create_worker()

4461 * drops pool->lock

4462 */

4463 while (true) {

4464 rcuwait_wait_event(&manager_wait,

4465 !(pool->flags & POOL_MANAGER_ACTIVE),

4466 TASK_UNINTERRUPTIBLE);

4467

4468 mutex_lock(&wq_pool_attach_mutex);

4469 raw_spin_lock_irq(&pool->lock);

4470 if (!(pool->flags & POOL_MANAGER_ACTIVE)) {

4471 pool->flags |= POOL_MANAGER_ACTIVE;

4472 break;

4473 }

4474 raw_spin_unlock_irq(&pool->lock);

4475 mutex_unlock(&wq_pool_attach_mutex);

4476 }

4477

4478 while ((worker = first_idle_worker(pool)))

4479 set_worker_dying(worker, &cull_list);

4480 WARN_ON(pool->nr_workers || pool->nr_idle);

4481 raw_spin_unlock_irq(&pool->lock);

4482

4483 wake_dying_workers(&cull_list);

4484

4485 if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))

4486 pool->detach_completion = &detach_completion;

4487 mutex_unlock(&wq_pool_attach_mutex);

4488

4489 if (pool->detach_completion)

4490 wait_for_completion(pool->detach_completion);

4491

4492 /* shut down the timers */

4493 del_timer_sync(&pool->idle_timer);

4494 cancel_work_sync(&pool->idle_cull_work);

4495 del_timer_sync(&pool->mayday_timer);

4496

4497 /* RCU protected to allow dereferences from get_work_pool() */

4498 call_rcu(&pool->rcu, rcu_free_pool);

4499 }

4500

4501 /**

4502 * get_unbound_pool - get a worker_pool with the specified attributes

4503 * @attrs: the attributes of the worker_pool to get

4504 *

4505 * Obtain a worker_pool which has the same attributes as @attrs, bump the

4506 * reference count and return it. If there already is a matching

4507 * worker_pool, it will be used; otherwise, this function attempts to

4508 * create a new one.

4509 *

4510 * Should be called with wq_pool_mutex held.

4511 *

4512 * Return: On success, a worker_pool with the same attributes as @attrs.

4513 * On failure, %NULL.

4514 */

4515 static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)

4516 {

4517 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_NUMA];

4518 u32 hash = wqattrs_hash(attrs);

4519 struct worker_pool *pool;

4520 int pod, node = NUMA_NO_NODE;

4521

4522 lockdep_assert_held(&wq_pool_mutex);

4523

4524 /* do we already have a matching pool? */

4525 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {

4526 if (wqattrs_equal(pool->attrs, attrs)) {

4527 pool->refcnt++;

4528 return pool;

4529 }

4530 }

4531

4532 /* If __pod_cpumask is contained inside a NUMA pod, that's our node */

4533 for (pod = 0; pod < pt->nr_pods; pod++) {

4534 if (cpumask_subset(attrs->__pod_cpumask, pt->pod_cpus[pod])) {

4535 node = pt->pod_node[pod];

4536 break;

4537 }

4538 }

4539

4540 /* nope, create a new one */

4541 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, node);

4542 if (!pool || init_worker_pool(pool) < 0)

4543 goto fail;

4544

4545 pool->node = node;

4546 copy_workqueue_attrs(pool->attrs, attrs);

4547 wqattrs_clear_for_pool(pool->attrs);

4548

4549 if (worker_pool_assign_id(pool) < 0)

4550 goto fail;

4551

4552 /* create and start the initial worker */

4553 if (wq_online && !create_worker(pool))

4554 goto fail;

4555

4556 /* install */

4557 hash_add(unbound_pool_hash, &pool->hash_node, hash);

4558

4559 return pool;

4560 fail:

4561 if (pool)

4562 put_unbound_pool(pool);

4563 return NULL;

4564 }

4565

4566 static void rcu_free_pwq(struct rcu_head *rcu)

4567 {

4568 kmem_cache_free(pwq_cache,

4569 container_of(rcu, struct pool_workqueue, rcu));

4570 }

4571

4572 /*

4573 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero

4574 * refcnt and needs to be destroyed.

4575 */

4576 static void pwq_release_workfn(struct kthread_work *work)

4577 {

4578 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,

4579 release_work);

4580 struct workqueue_struct *wq = pwq->wq;

4581 struct worker_pool *pool = pwq->pool;

4582 bool is_last = false;

4583

4584 /*

4585 * When @pwq is not linked, it doesn't hold any reference to the

4586 * @wq, and @wq is invalid to access.

4587 */

4588 if (!list_empty(&pwq->pwqs_node)) {

4589 mutex_lock(&wq->mutex);

4590 list_del_rcu(&pwq->pwqs_node);

4591 is_last = list_empty(&wq->pwqs);

4592 mutex_unlock(&wq->mutex);

4593 }

4594

4595 if (wq->flags & WQ_UNBOUND) {

4596 mutex_lock(&wq_pool_mutex);

4597 put_unbound_pool(pool);

4598 mutex_unlock(&wq_pool_mutex);

4599 }

4600

4601 if (!list_empty(&pwq->pending_node)) {

4602 struct wq_node_nr_active *nna =

4603 wq_node_nr_active(pwq->wq, pwq->pool->node);

4604

4605 raw_spin_lock_irq(&nna->lock);

4606 list_del_init(&pwq->pending_node);

4607 raw_spin_unlock_irq(&nna->lock);

4608 }

4609

4610 call_rcu(&pwq->rcu, rcu_free_pwq);

4611

4612 /*

4613 * If we're the last pwq going away, @wq is already dead and no one

4614 * is gonna access it anymore. Schedule RCU free.

4615 */

4616 if (is_last) {

4617 wq_unregister_lockdep(wq);

4618 call_rcu(&wq->rcu, rcu_free_wq);

4619 }

4620 }

4621

4622 /* initialize newly allocated @pwq which is associated with @wq and @pool */

4623 static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,

4624 struct worker_pool *pool)

4625 {

4626 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

4627

4628 memset(pwq, 0, sizeof(*pwq));

4629

4630 pwq->pool = pool;

4631 pwq->wq = wq;

4632 pwq->flush_color = -1;

4633 pwq->refcnt = 1;

4634 INIT_LIST_HEAD(&pwq->inactive_works);

4635 INIT_LIST_HEAD(&pwq->pending_node);

4636 INIT_LIST_HEAD(&pwq->pwqs_node);

4637 INIT_LIST_HEAD(&pwq->mayday_node);

4638 kthread_init_work(&pwq->release_work, pwq_release_workfn);

4639 }

4640

4641 /* sync @pwq with the current state of its associated wq and link it */

4642 static void link_pwq(struct pool_workqueue *pwq)

4643 {

4644 struct workqueue_struct *wq = pwq->wq;

4645

4646 lockdep_assert_held(&wq->mutex);

4647

4648 /* may be called multiple times, ignore if already linked */

4649 if (!list_empty(&pwq->pwqs_node))

4650 return;

4651

4652 /* set the matching work_color */

4653 pwq->work_color = wq->work_color;

4654

4655 /* link in @pwq */

4656 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);

4657 }

4658

4659 /* obtain a pool matching @attr and create a pwq associating the pool and @wq */

4660 static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,

4661 const struct workqueue_attrs *attrs)

4662 {

4663 struct worker_pool *pool;

4664 struct pool_workqueue *pwq;

4665

4666 lockdep_assert_held(&wq_pool_mutex);

4667

4668 pool = get_unbound_pool(attrs);

4669 if (!pool)

4670 return NULL;

4671

4672 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);

4673 if (!pwq) {

4674 put_unbound_pool(pool);

4675 return NULL;

4676 }

4677

4678 init_pwq(pwq, wq, pool);

4679 return pwq;

4680 }

4681

4682 /**

4683 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod

4684 * @attrs: the wq_attrs of the default pwq of the target workqueue

4685 * @cpu: the target CPU

4686 * @cpu_going_down: if >= 0, the CPU to consider as offline

4687 *

4688 * Calculate the cpumask a workqueue with @attrs should use on @pod. If

4689 * @cpu_going_down is >= 0, that cpu is considered offline during calculation.

4690 * The result is stored in @attrs->__pod_cpumask.

4691 *

4692 * If pod affinity is not enabled, @attrs->cpumask is always used. If enabled

4693 * and @pod has online CPUs requested by @attrs, the returned cpumask is the

4694 * intersection of the possible CPUs of @pod and @attrs->cpumask.

4695 *

4696 * The caller is responsible for ensuring that the cpumask of @pod stays stable.

4697 */

4698 static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,

4699 int cpu_going_down)

4700 {

4701 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);

4702 int pod = pt->cpu_pod[cpu];

4703

4704 /* does @pod have any online CPUs @attrs wants? */

4705 cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);

4706 cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);

4707 if (cpu_going_down >= 0)

4708 cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);

4709

4710 if (cpumask_empty(attrs->__pod_cpumask)) {

4711 cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);

4712 return;

4713 }

4714

4715 /* yeap, return possible CPUs in @pod that @attrs wants */

4716 cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);

4717

4718 if (cpumask_empty(attrs->__pod_cpumask))

4719 pr_warn_once("WARNING: workqueue cpumask: online intersect > "

4720 "possible intersect\n");

4721 }

4722

4723 /* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */

4724 static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,

4725 int cpu, struct pool_workqueue *pwq)

4726 {

4727 struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);

4728 struct pool_workqueue *old_pwq;

4729

4730 lockdep_assert_held(&wq_pool_mutex);

4731 lockdep_assert_held(&wq->mutex);

4732

4733 /* link_pwq() can handle duplicate calls */

4734 link_pwq(pwq);

4735

4736 old_pwq = rcu_access_pointer(*slot);

4737 rcu_assign_pointer(*slot, pwq);

4738 return old_pwq;

4739 }

4740

4741 /* context to store the prepared attrs & pwqs before applying */

4742 struct apply_wqattrs_ctx {

4743 struct workqueue_struct *wq; /* target workqueue */

4744 struct workqueue_attrs *attrs; /* attrs to apply */

4745 struct list_head list; /* queued for batching commit */

4746 struct pool_workqueue *dfl_pwq;

4747 struct pool_workqueue *pwq_tbl[];

4748 };

4749

4750 /* free the resources after success or abort */

4751 static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)

4752 {

4753 if (ctx) {

4754 int cpu;

4755

4756 for_each_possible_cpu(cpu)

4757 put_pwq_unlocked(ctx->pwq_tbl[cpu]);

4758 put_pwq_unlocked(ctx->dfl_pwq);

4759

4760 free_workqueue_attrs(ctx->attrs);

4761

4762 kfree(ctx);

4763 }

4764 }

4765

4766 /* allocate the attrs and pwqs for later installation */

4767 static struct apply_wqattrs_ctx *

4768 apply_wqattrs_prepare(struct workqueue_struct *wq,

4769 const struct workqueue_attrs *attrs,

4770 const cpumask_var_t unbound_cpumask)

4771 {

4772 struct apply_wqattrs_ctx *ctx;

4773 struct workqueue_attrs *new_attrs;

4774 int cpu;

4775

4776 lockdep_assert_held(&wq_pool_mutex);

4777

4778 if (WARN_ON(attrs->affn_scope < 0 ||

4779 attrs->affn_scope >= WQ_AFFN_NR_TYPES))

4780 return ERR_PTR(-EINVAL);

4781

4782 ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);

4783

4784 new_attrs = alloc_workqueue_attrs();

4785 if (!ctx || !new_attrs)

4786 goto out_free;

4787

4788 /*

4789 * If something goes wrong during CPU up/down, we'll fall back to

4790 * the default pwq covering whole @attrs->cpumask. Always create

4791 * it even if we don't use it immediately.

4792 */

4793 copy_workqueue_attrs(new_attrs, attrs);

4794 wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);

4795 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);

4796 ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);

4797 if (!ctx->dfl_pwq)

4798 goto out_free;

4799

4800 for_each_possible_cpu(cpu) {

4801 if (new_attrs->ordered) {

4802 ctx->dfl_pwq->refcnt++;

4803 ctx->pwq_tbl[cpu] = ctx->dfl_pwq;

4804 } else {

4805 wq_calc_pod_cpumask(new_attrs, cpu, -1);

4806 ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);

4807 if (!ctx->pwq_tbl[cpu])

4808 goto out_free;

4809 }

4810 }

4811

4812 /* save the user configured attrs and sanitize it. */

4813 copy_workqueue_attrs(new_attrs, attrs);

4814 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);

4815 cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);

4816 ctx->attrs = new_attrs;

4817

4818 ctx->wq = wq;

4819 return ctx;

4820

4821 out_free:

4822 free_workqueue_attrs(new_attrs);

4823 apply_wqattrs_cleanup(ctx);

4824 return ERR_PTR(-ENOMEM);

4825 }

4826

4827 /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */

4828 static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)

4829 {

4830 int cpu;

4831

4832 /* all pwqs have been created successfully, let's install'em */

4833 mutex_lock(&ctx->wq->mutex);

4834

4835 copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

4836

4837 /* save the previous pwqs and install the new ones */

4838 for_each_possible_cpu(cpu)

4839 ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,

4840 ctx->pwq_tbl[cpu]);

4841 ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);

4842

4843 /* update node_nr_active->max */

4844 wq_update_node_max_active(ctx->wq, -1);

4845

4846 mutex_unlock(&ctx->wq->mutex);

4847 }

4848

4849 static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,

4850 const struct workqueue_attrs *attrs)

4851 {

4852 struct apply_wqattrs_ctx *ctx;

4853

4854 /* only unbound workqueues can change attributes */

4855 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))

4856 return -EINVAL;

4857

4858 /* creating multiple pwqs breaks ordering guarantee */

4859 if (!list_empty(&wq->pwqs)) {

4860 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))

4861 return -EINVAL;

4862

4863 wq->flags &= ~__WQ_ORDERED;

4864 }

4865

4866 ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);

4867 if (IS_ERR(ctx))

4868 return PTR_ERR(ctx);

4869

4870 /* the ctx has been prepared successfully, let's commit it */

4871 apply_wqattrs_commit(ctx);

4872 apply_wqattrs_cleanup(ctx);

4873

4874 return 0;

4875 }

4876

4877 /**

4878 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue

4879 * @wq: the target workqueue

4880 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()

4881 *

4882 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps

4883 * a separate pwq to each CPU pod with possibles CPUs in @attrs->cpumask so that

4884 * work items are affine to the pod it was issued on. Older pwqs are released as

4885 * in-flight work items finish. Note that a work item which repeatedly requeues

4886 * itself back-to-back will stay on its current pwq.

4887 *

4888 * Performs GFP_KERNEL allocations.

4889 *

4890 * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().

4891 *

4892 * Return: 0 on success and -errno on failure.

4893 */

4894 int apply_workqueue_attrs(struct workqueue_struct *wq,

4895 const struct workqueue_attrs *attrs)

4896 {

4897 int ret;

4898

4899 lockdep_assert_cpus_held();

4900

4901 mutex_lock(&wq_pool_mutex);

4902 ret = apply_workqueue_attrs_locked(wq, attrs);

4903 mutex_unlock(&wq_pool_mutex);

4904

4905 return ret;

4906 }

4907

4908 /**

4909 * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug

4910 * @wq: the target workqueue

4911 * @cpu: the CPU to update pool association for

4912 * @hotplug_cpu: the CPU coming up or going down

4913 * @online: whether @cpu is coming up or going down

4914 *

4915 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and

4916 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of

4917 * @wq accordingly.

4918 *

4919 *

4920 * If pod affinity can't be adjusted due to memory allocation failure, it falls

4921 * back to @wq->dfl_pwq which may not be optimal but is always correct.

4922 *

4923 * Note that when the last allowed CPU of a pod goes offline for a workqueue

4924 * with a cpumask spanning multiple pods, the workers which were already

4925 * executing the work items for the workqueue will lose their CPU affinity and

4926 * may execute on any CPU. This is similar to how per-cpu workqueues behave on

4927 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's

4928 * responsibility to flush the work item from CPU_DOWN_PREPARE.

4929 */

4930 static void wq_update_pod(struct workqueue_struct *wq, int cpu,

4931 int hotplug_cpu, bool online)

4932 {

4933 int off_cpu = online ? -1 : hotplug_cpu;

4934 struct pool_workqueue *old_pwq = NULL, *pwq;

4935 struct workqueue_attrs *target_attrs;

4936

4937 lockdep_assert_held(&wq_pool_mutex);

4938

4939 if (!(wq->flags & WQ_UNBOUND) || wq->unbound_attrs->ordered)

4940 return;

4941

4942 /*

4943 * We don't wanna alloc/free wq_attrs for each wq for each CPU.

4944 * Let's use a preallocated one. The following buf is protected by

4945 * CPU hotplug exclusion.

4946 */

4947 target_attrs = wq_update_pod_attrs_buf;

4948

4949 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);

4950 wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);

4951

4952 /* nothing to do if the target cpumask matches the current pwq */

4953 wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);

4954 if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))

4955 return;

4956

4957 /* create a new pwq */

4958 pwq = alloc_unbound_pwq(wq, target_attrs);

4959 if (!pwq) {

4960 pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",

4961 wq->name);

4962 goto use_dfl_pwq;

4963 }

4964

4965 /* Install the new pwq. */

4966 mutex_lock(&wq->mutex);

4967 old_pwq = install_unbound_pwq(wq, cpu, pwq);

4968 goto out_unlock;

4969

4970 use_dfl_pwq:

4971 mutex_lock(&wq->mutex);

4972 pwq = unbound_pwq(wq, -1);

4973 raw_spin_lock_irq(&pwq->pool->lock);

4974 get_pwq(pwq);

4975 raw_spin_unlock_irq(&pwq->pool->lock);

4976 old_pwq = install_unbound_pwq(wq, cpu, pwq);

4977 out_unlock:

4978 mutex_unlock(&wq->mutex);

4979 put_pwq_unlocked(old_pwq);

4980 }

4981

4982 static int alloc_and_link_pwqs(struct workqueue_struct *wq)

4983 {

4984 bool highpri = wq->flags & WQ_HIGHPRI;

4985 int cpu, ret;

4986

4987 wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);

4988 if (!wq->cpu_pwq)

4989 goto enomem;

4990

4991 if (!(wq->flags & WQ_UNBOUND)) {

4992 for_each_possible_cpu(cpu) {

4993 struct pool_workqueue **pwq_p =

4994 per_cpu_ptr(wq->cpu_pwq, cpu);

4995 struct worker_pool *pool =

4996 &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]);

4997

4998 *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,

4999 pool->node);

5000 if (!*pwq_p)

5001 goto enomem;

5002

5003 init_pwq(*pwq_p, wq, pool);

5004

5005 mutex_lock(&wq->mutex);

5006 link_pwq(*pwq_p);

5007 mutex_unlock(&wq->mutex);

5008 }

5009 return 0;

5010 }

5011

5012 cpus_read_lock();

5013 if (wq->flags & __WQ_ORDERED) {

5014 struct pool_workqueue *dfl_pwq;

5015

5016 ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);

5017 /* there should only be single pwq for ordering guarantee */

5018 dfl_pwq = rcu_access_pointer(wq->dfl_pwq);

5019 WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||

5020 wq->pwqs.prev != &dfl_pwq->pwqs_node),

5021 "ordering guarantee broken for workqueue %s\n", wq->name);

5022 } else {

5023 ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);

5024 }

5025 cpus_read_unlock();

5026

5027 /* for unbound pwq, flush the pwq_release_worker ensures that the

5028 * pwq_release_workfn() completes before calling kfree(wq).

5029 */

5030 if (ret)

5031 kthread_flush_worker(pwq_release_worker);

5032

5033 return ret;

5034

5035 enomem:

5036 if (wq->cpu_pwq) {

5037 for_each_possible_cpu(cpu) {

5038 struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

5039

5040 if (pwq)

5041 kmem_cache_free(pwq_cache, pwq);

5042 }

5043 free_percpu(wq->cpu_pwq);

5044 wq->cpu_pwq = NULL;

5045 }

5046 return -ENOMEM;

5047 }

5048

5049 static int wq_clamp_max_active(int max_active, unsigned int flags,

5050 const char *name)

5051 {

5052 if (max_active < 1 || max_active > WQ_MAX_ACTIVE)

5053 pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",

5054 max_active, name, 1, WQ_MAX_ACTIVE);

5055

5056 return clamp_val(max_active, 1, WQ_MAX_ACTIVE);

5057 }

5058

5059 /*

5060 * Workqueues which may be used during memory reclaim should have a rescuer

5061 * to guarantee forward progress.

5062 */

5063 static int init_rescuer(struct workqueue_struct *wq)

5064 {

5065 struct worker *rescuer;

5066 int ret;

5067

5068 if (!(wq->flags & WQ_MEM_RECLAIM))

5069 return 0;

5070

5071 rescuer = alloc_worker(NUMA_NO_NODE);

5072 if (!rescuer) {

5073 pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",

5074 wq->name);

5075 return -ENOMEM;

5076 }

5077

5078 rescuer->rescue_wq = wq;

5079 rescuer->task = kthread_create(rescuer_thread, rescuer, "kworker/R-%s", wq->name);

5080 if (IS_ERR(rescuer->task)) {

5081 ret = PTR_ERR(rescuer->task);

5082 pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",

5083 wq->name, ERR_PTR(ret));

5084 kfree(rescuer);

5085 return ret;

5086 }

5087

5088 wq->rescuer = rescuer;

5089 if (wq->flags & WQ_UNBOUND)

5090 kthread_bind_mask(rescuer->task, wq->unbound_attrs->cpumask);

5091 else

5092 kthread_bind_mask(rescuer->task, cpu_possible_mask);

5093 wake_up_process(rescuer->task);

5094

5095 return 0;

5096 }

5097

5098 /**

5099 * wq_adjust_max_active - update a wq's max_active to the current setting

5100 * @wq: target workqueue

5101 *

5102 * If @wq isn't freezing, set @wq->max_active to the saved_max_active and

5103 * activate inactive work items accordingly. If @wq is freezing, clear

5104 * @wq->max_active to zero.

5105 */

5106 static void wq_adjust_max_active(struct workqueue_struct *wq)

5107 {

5108 bool activated;

5109 int new_max, new_min;

5110

5111 lockdep_assert_held(&wq->mutex);

5112

5113 if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {

5114 new_max = 0;

5115 new_min = 0;

5116 } else {

5117 new_max = wq->saved_max_active;

5118 new_min = wq->saved_min_active;

5119 }

5120

5121 if (wq->max_active == new_max && wq->min_active == new_min)

5122 return;

5123

5124 /*

5125 * Update @wq->max/min_active and then kick inactive work items if more

5126 * active work items are allowed. This doesn't break work item ordering

5127 * because new work items are always queued behind existing inactive

5128 * work items if there are any.

5129 */

5130 WRITE_ONCE(wq->max_active, new_max);

5131 WRITE_ONCE(wq->min_active, new_min);

5132

5133 if (wq->flags & WQ_UNBOUND)

5134 wq_update_node_max_active(wq, -1);

5135

5136 if (new_max == 0)

5137 return;

5138

5139 /*

5140 * Round-robin through pwq's activating the first inactive work item

5141 * until max_active is filled.

5142 */

5143 do {

5144 struct pool_workqueue *pwq;

5145

5146 activated = false;

5147 for_each_pwq(pwq, wq) {

5148 unsigned long flags;

5149

5150 /* can be called during early boot w/ irq disabled */

5151 raw_spin_lock_irqsave(&pwq->pool->lock, flags);

5152 if (pwq_activate_first_inactive(pwq, true)) {

5153 activated = true;

5154 kick_pool(pwq->pool);

5155 }

5156 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);

5157 }

5158 } while (activated);

5159 }

5160

5161 __printf(1, 4)

5162 struct workqueue_struct *alloc_workqueue(const char *fmt,

5163 unsigned int flags,

5164 int max_active, ...)

5165 {

5166 va_list args;

5167 struct workqueue_struct *wq;

5168 size_t wq_size;

5169 int name_len;

5170

5171 /*

5172 * Unbound && max_active == 1 used to imply ordered, which is no longer

5173 * the case on many machines due to per-pod pools. While

5174 * alloc_ordered_workqueue() is the right way to create an ordered

5175 * workqueue, keep the previous behavior to avoid subtle breakages.

5176 */

5177 if ((flags & WQ_UNBOUND) && max_active == 1)

5178 flags |= __WQ_ORDERED;

5179

5180 /* see the comment above the definition of WQ_POWER_EFFICIENT */

5181 if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)

5182 flags |= WQ_UNBOUND;

5183

5184 /* allocate wq and format name */

5185 if (flags & WQ_UNBOUND)

5186 wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);

5187 else

5188 wq_size = sizeof(*wq);

5189

5190 wq = kzalloc(wq_size, GFP_KERNEL);

5191 if (!wq)

5192 return NULL;

5193

5194 if (flags & WQ_UNBOUND) {

5195 wq->unbound_attrs = alloc_workqueue_attrs();

5196 if (!wq->unbound_attrs)

5197 goto err_free_wq;

5198 }

5199

5200 va_start(args, max_active);

5201 name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);

5202 va_end(args);

5203

5204 if (name_len >= WQ_NAME_LEN)

5205 pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",

5206 wq->name);

5207

5208 max_active = max_active ?: WQ_DFL_ACTIVE;

5209 max_active = wq_clamp_max_active(max_active, flags, wq->name);

5210

5211 /* init wq */

5212 wq->flags = flags;

5213 wq->max_active = max_active;

5214 wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);

5215 wq->saved_max_active = wq->max_active;

5216 wq->saved_min_active = wq->min_active;

5217 mutex_init(&wq->mutex);

5218 atomic_set(&wq->nr_pwqs_to_flush, 0);

5219 INIT_LIST_HEAD(&wq->pwqs);

5220 INIT_LIST_HEAD(&wq->flusher_queue);

5221 INIT_LIST_HEAD(&wq->flusher_overflow);

5222 INIT_LIST_HEAD(&wq->maydays);

5223

5224 wq_init_lockdep(wq);

5225 INIT_LIST_HEAD(&wq->list);

5226

5227 if (flags & WQ_UNBOUND) {

5228 if (alloc_node_nr_active(wq->node_nr_active) < 0)

5229 goto err_unreg_lockdep;

5230 }

5231

5232 if (alloc_and_link_pwqs(wq) < 0)

5233 goto err_free_node_nr_active;

5234

5235 if (wq_online && init_rescuer(wq) < 0)

5236 goto err_destroy;

5237

5238 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))

5239 goto err_destroy;

5240

5241 /*

5242 * wq_pool_mutex protects global freeze state and workqueues list.

5243 * Grab it, adjust max_active and add the new @wq to workqueues

5244 * list.

5245 */

5246 mutex_lock(&wq_pool_mutex);

5247

5248 mutex_lock(&wq->mutex);

5249 wq_adjust_max_active(wq);

5250 mutex_unlock(&wq->mutex);

5251

5252 list_add_tail_rcu(&wq->list, &workqueues);

5253

5254 mutex_unlock(&wq_pool_mutex);

5255

5256 return wq;

5257

5258 err_free_node_nr_active:

5259 if (wq->flags & WQ_UNBOUND)

5260 free_node_nr_active(wq->node_nr_active);

5261 err_unreg_lockdep:

5262 wq_unregister_lockdep(wq);

5263 wq_free_lockdep(wq);

5264 err_free_wq:

5265 free_workqueue_attrs(wq->unbound_attrs);

5266 kfree(wq);

5267 return NULL;

5268 err_destroy:

5269 destroy_workqueue(wq);

5270 return NULL;

5271 }

5272 EXPORT_SYMBOL_GPL(alloc_workqueue);

5273

5274 static bool pwq_busy(struct pool_workqueue *pwq)

5275 {

5276 int i;

5277

5278 for (i = 0; i < WORK_NR_COLORS; i++)

5279 if (pwq->nr_in_flight[i])

5280 return true;

5281

5282 if ((pwq != rcu_access_pointer(pwq->wq->dfl_pwq)) && (pwq->refcnt > 1))

5283 return true;

5284 if (!pwq_is_empty(pwq))

5285 return true;

5286

5287 return false;

5288 }

5289

5290 /**

5291 * destroy_workqueue - safely terminate a workqueue

5292 * @wq: target workqueue

5293 *

5294 * Safely destroy a workqueue. All work currently pending will be done first.

5295 */

5296 void destroy_workqueue(struct workqueue_struct *wq)

5297 {

5298 struct pool_workqueue *pwq;

5299 int cpu;

5300

5301 /*

5302 * Remove it from sysfs first so that sanity check failure doesn't

5303 * lead to sysfs name conflicts.

5304 */

5305 workqueue_sysfs_unregister(wq);

5306

5307 /* mark the workqueue destruction is in progress */

5308 mutex_lock(&wq->mutex);

5309 wq->flags |= __WQ_DESTROYING;

5310 mutex_unlock(&wq->mutex);

5311

5312 /* drain it before proceeding with destruction */

5313 drain_workqueue(wq);

5314

5315 /* kill rescuer, if sanity checks fail, leave it w/o rescuer */

5316 if (wq->rescuer) {

5317 struct worker *rescuer = wq->rescuer;

5318

5319 /* this prevents new queueing */

5320 raw_spin_lock_irq(&wq_mayday_lock);

5321 wq->rescuer = NULL;

5322 raw_spin_unlock_irq(&wq_mayday_lock);

5323

5324 /* rescuer will empty maydays list before exiting */

5325 kthread_stop(rescuer->task);

5326 kfree(rescuer);

5327 }

5328

5329 /*

5330 * Sanity checks - grab all the locks so that we wait for all

5331 * in-flight operations which may do put_pwq().

5332 */

5333 mutex_lock(&wq_pool_mutex);

5334 mutex_lock(&wq->mutex);

5335 for_each_pwq(pwq, wq) {

5336 raw_spin_lock_irq(&pwq->pool->lock);

5337 if (WARN_ON(pwq_busy(pwq))) {

5338 pr_warn("%s: %s has the following busy pwq\n",

5339 __func__, wq->name);

5340 show_pwq(pwq);

5341 raw_spin_unlock_irq(&pwq->pool->lock);

5342 mutex_unlock(&wq->mutex);

5343 mutex_unlock(&wq_pool_mutex);

5344 show_one_workqueue(wq);

5345 return;

5346 }

5347 raw_spin_unlock_irq(&pwq->pool->lock);

5348 }

5349 mutex_unlock(&wq->mutex);

5350

5351 /*

5352 * wq list is used to freeze wq, remove from list after

5353 * flushing is complete in case freeze races us.

5354 */

5355 list_del_rcu(&wq->list);

5356 mutex_unlock(&wq_pool_mutex);

5357

5358 /*

5359 * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq

5360 * to put the base refs. @wq will be auto-destroyed from the last

5361 * pwq_put. RCU read lock prevents @wq from going away from under us.

5362 */

5363 rcu_read_lock();

5364

5365 for_each_possible_cpu(cpu) {

5366 put_pwq_unlocked(unbound_pwq(wq, cpu));

5367 RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);

5368 }

5369

5370 put_pwq_unlocked(unbound_pwq(wq, -1));

5371 RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);

5372

5373 rcu_read_unlock();

5374 }

5375 EXPORT_SYMBOL_GPL(destroy_workqueue);

5376

5377 /**

5378 * workqueue_set_max_active - adjust max_active of a workqueue

5379 * @wq: target workqueue

5380 * @max_active: new max_active value.

5381 *

5382 * Set max_active of @wq to @max_active. See the alloc_workqueue() function

5383 * comment.

5384 *

5385 * CONTEXT:

5386 * Don't call from IRQ context.

5387 */

5388 void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)

5389 {

5390 /* disallow meddling with max_active for ordered workqueues */

5391 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))

5392 return;

5393

5394 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);

5395

5396 mutex_lock(&wq->mutex);

5397

5398 wq->flags &= ~__WQ_ORDERED;

5399 wq->saved_max_active = max_active;

5400 if (wq->flags & WQ_UNBOUND)

5401 wq->saved_min_active = min(wq->saved_min_active, max_active);

5402

5403 wq_adjust_max_active(wq);

5404

5405 mutex_unlock(&wq->mutex);

5406 }

5407 EXPORT_SYMBOL_GPL(workqueue_set_max_active);

5408

5409 /**

5410 * current_work - retrieve %current task's work struct

5411 *

5412 * Determine if %current task is a workqueue worker and what it's working on.

5413 * Useful to find out the context that the %current task is running in.

5414 *

5415 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.

5416 */

5417 struct work_struct *current_work(void)

5418 {

5419 struct worker *worker = current_wq_worker();

5420

5421 return worker ? worker->current_work : NULL;

5422 }

5423 EXPORT_SYMBOL(current_work);

5424

5425 /**

5426 * current_is_workqueue_rescuer - is %current workqueue rescuer?

5427 *

5428 * Determine whether %current is a workqueue rescuer. Can be used from

5429 * work functions to determine whether it's being run off the rescuer task.

5430 *

5431 * Return: %true if %current is a workqueue rescuer. %false otherwise.

5432 */

5433 bool current_is_workqueue_rescuer(void)

5434 {

5435 struct worker *worker = current_wq_worker();

5436

5437 return worker && worker->rescue_wq;

5438 }

5439

5440 /**

5441 * workqueue_congested - test whether a workqueue is congested

5442 * @cpu: CPU in question

5443 * @wq: target workqueue

5444 *

5445 * Test whether @wq's cpu workqueue for @cpu is congested. There is

5446 * no synchronization around this function and the test result is

5447 * unreliable and only useful as advisory hints or for debugging.

5448 *

5449 * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.

5450 *

5451 * With the exception of ordered workqueues, all workqueues have per-cpu

5452 * pool_workqueues, each with its own congested state. A workqueue being

5453 * congested on one CPU doesn't mean that the workqueue is contested on any

5454 * other CPUs.

5455 *

5456 * Return:

5457 * %true if congested, %false otherwise.

5458 */

5459 bool workqueue_congested(int cpu, struct workqueue_struct *wq)

5460 {

5461 struct pool_workqueue *pwq;

5462 bool ret;

5463

5464 rcu_read_lock();

5465 preempt_disable();

5466

5467 if (cpu == WORK_CPU_UNBOUND)

5468 cpu = smp_processor_id();

5469

5470 pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

5471 ret = !list_empty(&pwq->inactive_works);

5472

5473 preempt_enable();

5474 rcu_read_unlock();

5475

5476 return ret;

5477 }

5478 EXPORT_SYMBOL_GPL(workqueue_congested);

5479

5480 /**

5481 * work_busy - test whether a work is currently pending or running

5482 * @work: the work to be tested

5483 *

5484 * Test whether @work is currently pending or running. There is no

5485 * synchronization around this function and the test result is

5486 * unreliable and only useful as advisory hints or for debugging.

5487 *

5488 * Return:

5489 * OR'd bitmask of WORK_BUSY_* bits.

5490 */

5491 unsigned int work_busy(struct work_struct *work)

5492 {

5493 struct worker_pool *pool;

5494 unsigned long flags;

5495 unsigned int ret = 0;

5496

5497 if (work_pending(work))

5498 ret |= WORK_BUSY_PENDING;

5499

5500 rcu_read_lock();

5501 pool = get_work_pool(work);

5502 if (pool) {

5503 raw_spin_lock_irqsave(&pool->lock, flags);

5504 if (find_worker_executing_work(pool, work))

5505 ret |= WORK_BUSY_RUNNING;

5506 raw_spin_unlock_irqrestore(&pool->lock, flags);

5507 }

5508 rcu_read_unlock();

5509

5510 return ret;

5511 }

5512 EXPORT_SYMBOL_GPL(work_busy);

5513

5514 /**

5515 * set_worker_desc - set description for the current work item

5516 * @fmt: printf-style format string

5517 * @...: arguments for the format string

5518 *

5519 * This function can be called by a running work function to describe what

5520 * the work item is about. If the worker task gets dumped, this

5521 * information will be printed out together to help debugging. The

5522 * description can be at most WORKER_DESC_LEN including the trailing '\0'.

5523 */

5524 void set_worker_desc(const char *fmt, ...)

5525 {

5526 struct worker *worker = current_wq_worker();

5527 va_list args;

5528

5529 if (worker) {

5530 va_start(args, fmt);

5531 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);

5532 va_end(args);

5533 }

5534 }

5535 EXPORT_SYMBOL_GPL(set_worker_desc);

5536

5537 /**

5538 * print_worker_info - print out worker information and description

5539 * @log_lvl: the log level to use when printing

5540 * @task: target task

5541 *

5542 * If @task is a worker and currently executing a work item, print out the

5543 * name of the workqueue being serviced and worker description set with

5544 * set_worker_desc() by the currently executing work item.

5545 *

5546 * This function can be safely called on any task as long as the

5547 * task_struct itself is accessible. While safe, this function isn't

5548 * synchronized and may print out mixups or garbages of limited length.

5549 */

5550 void print_worker_info(const char *log_lvl, struct task_struct *task)

5551 {

5552 work_func_t *fn = NULL;

5553 char name[WQ_NAME_LEN] = { };

5554 char desc[WORKER_DESC_LEN] = { };

5555 struct pool_workqueue *pwq = NULL;

5556 struct workqueue_struct *wq = NULL;

5557 struct worker *worker;

5558

5559 if (!(task->flags & PF_WQ_WORKER))

5560 return;

5561

5562 /*

5563 * This function is called without any synchronization and @task

5564 * could be in any state. Be careful with dereferences.

5565 */

5566 worker = kthread_probe_data(task);

5567

5568 /*

5569 * Carefully copy the associated workqueue's workfn, name and desc.

5570 * Keep the original last '\0' in case the original is garbage.

5571 */

5572 copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));

5573 copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));

5574 copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));

5575 copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);

5576 copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);

5577

5578 if (fn || name[0] || desc[0]) {

5579 printk("%sWorkqueue: %s %ps", log_lvl, name, fn);

5580 if (strcmp(name, desc))

5581 pr_cont(" (%s)", desc);

5582 pr_cont("\n");

5583 }

5584 }

5585

5586 static void pr_cont_pool_info(struct worker_pool *pool)

5587 {

5588 pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);

5589 if (pool->node != NUMA_NO_NODE)

5590 pr_cont(" node=%d", pool->node);

5591 pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);

5592 }

5593

5594 struct pr_cont_work_struct {

5595 bool comma;

5596 work_func_t func;

5597 long ctr;

5598 };

5599

5600 static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)

5601 {

5602 if (!pcwsp->ctr)

5603 goto out_record;

5604 if (func == pcwsp->func) {

5605 pcwsp->ctr++;

5606 return;

5607 }

5608 if (pcwsp->ctr == 1)

5609 pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);

5610 else

5611 pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);

5612 pcwsp->ctr = 0;

5613 out_record:

5614 if ((long)func == -1L)

5615 return;

5616 pcwsp->comma = comma;

5617 pcwsp->func = func;

5618 pcwsp->ctr = 1;

5619 }

5620

5621 static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)

5622 {

5623 if (work->func == wq_barrier_func) {

5624 struct wq_barrier *barr;

5625

5626 barr = container_of(work, struct wq_barrier, work);

5627

5628 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);

5629 pr_cont("%s BAR(%d)", comma ? "," : "",

5630 task_pid_nr(barr->task));

5631 } else {

5632 if (!comma)

5633 pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);

5634 pr_cont_work_flush(comma, work->func, pcwsp);

5635 }

5636 }

5637

5638 static void show_pwq(struct pool_workqueue *pwq)

5639 {

5640 struct pr_cont_work_struct pcws = { .ctr = 0, };

5641 struct worker_pool *pool = pwq->pool;

5642 struct work_struct *work;

5643 struct worker *worker;

5644 bool has_in_flight = false, has_pending = false;

5645 int bkt;

5646

5647 pr_info(" pwq %d:", pool->id);

5648 pr_cont_pool_info(pool);

5649

5650 pr_cont(" active=%d refcnt=%d%s\n",

5651 pwq->nr_active, pwq->refcnt,

5652 !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

5653

5654 hash_for_each(pool->busy_hash, bkt, worker, hentry) {

5655 if (worker->current_pwq == pwq) {

5656 has_in_flight = true;

5657 break;

5658 }

5659 }

5660 if (has_in_flight) {

5661 bool comma = false;

5662

5663 pr_info(" in-flight:");

5664 hash_for_each(pool->busy_hash, bkt, worker, hentry) {

5665 if (worker->current_pwq != pwq)

5666 continue;

5667

5668 pr_cont("%s %d%s:%ps", comma ? "," : "",

5669 task_pid_nr(worker->task),

5670 worker->rescue_wq ? "(RESCUER)" : "",

5671 worker->current_func);

5672 list_for_each_entry(work, &worker->scheduled, entry)

5673 pr_cont_work(false, work, &pcws);

5674 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);

5675 comma = true;

5676 }

5677 pr_cont("\n");

5678 }

5679

5680 list_for_each_entry(work, &pool->worklist, entry) {

5681 if (get_work_pwq(work) == pwq) {

5682 has_pending = true;

5683 break;

5684 }

5685 }

5686 if (has_pending) {

5687 bool comma = false;

5688

5689 pr_info(" pending:");

5690 list_for_each_entry(work, &pool->worklist, entry) {

5691 if (get_work_pwq(work) != pwq)

5692 continue;

5693

5694 pr_cont_work(comma, work, &pcws);

5695 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);

5696 }

5697 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);

5698 pr_cont("\n");

5699 }

5700

5701 if (!list_empty(&pwq->inactive_works)) {

5702 bool comma = false;

5703

5704 pr_info(" inactive:");

5705 list_for_each_entry(work, &pwq->inactive_works, entry) {

5706 pr_cont_work(comma, work, &pcws);

5707 comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);

5708 }

5709 pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);

5710 pr_cont("\n");

5711 }

5712 }

5713

5714 /**

5715 * show_one_workqueue - dump state of specified workqueue

5716 * @wq: workqueue whose state will be printed

5717 */

5718 void show_one_workqueue(struct workqueue_struct *wq)

5719 {

5720 struct pool_workqueue *pwq;

5721 bool idle = true;

5722 unsigned long flags;

5723

5724 for_each_pwq(pwq, wq) {

5725 if (!pwq_is_empty(pwq)) {

5726 idle = false;

5727 break;

5728 }

5729 }

5730 if (idle) /* Nothing to print for idle workqueue */

5731 return;

5732

5733 pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

5734

5735 for_each_pwq(pwq, wq) {

5736 raw_spin_lock_irqsave(&pwq->pool->lock, flags);

5737 if (!pwq_is_empty(pwq)) {

5738 /*

5739 * Defer printing to avoid deadlocks in console

5740 * drivers that queue work while holding locks

5741 * also taken in their write paths.

5742 */

5743 printk_deferred_enter();

5744 show_pwq(pwq);

5745 printk_deferred_exit();

5746 }

5747 raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);

5748 /*

5749 * We could be printing a lot from atomic context, e.g.

5750 * sysrq-t -> show_all_workqueues(). Avoid triggering

5751 * hard lockup.

5752 */

5753 touch_nmi_watchdog();

5754 }

5755

5756 }

5757

5758 /**

5759 * show_one_worker_pool - dump state of specified worker pool

5760 * @pool: worker pool whose state will be printed

5761 */

5762 static void show_one_worker_pool(struct worker_pool *pool)

5763 {

5764 struct worker *worker;

5765 bool first = true;

5766 unsigned long flags;

5767 unsigned long hung = 0;

5768

5769 raw_spin_lock_irqsave(&pool->lock, flags);

5770 if (pool->nr_workers == pool->nr_idle)

5771 goto next_pool;

5772

5773 /* How long the first pending work is waiting for a worker. */

5774 if (!list_empty(&pool->worklist))

5775 hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;

5776

5777 /*

5778 * Defer printing to avoid deadlocks in console drivers that

5779 * queue work while holding locks also taken in their write

5780 * paths.

5781 */

5782 printk_deferred_enter();

5783 pr_info("pool %d:", pool->id);

5784 pr_cont_pool_info(pool);

5785 pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);

5786 if (pool->manager)

5787 pr_cont(" manager: %d",

5788 task_pid_nr(pool->manager->task));

5789 list_for_each_entry(worker, &pool->idle_list, entry) {

5790 pr_cont(" %s%d", first ? "idle: " : "",

5791 task_pid_nr(worker->task));

5792 first = false;

5793 }

5794 pr_cont("\n");

5795 printk_deferred_exit();

5796 next_pool:

5797 raw_spin_unlock_irqrestore(&pool->lock, flags);

5798 /*

5799 * We could be printing a lot from atomic context, e.g.

5800 * sysrq-t -> show_all_workqueues(). Avoid triggering

5801 * hard lockup.

5802 */

5803 touch_nmi_watchdog();

5804

5805 }

5806

5807 /**

5808 * show_all_workqueues - dump workqueue state

5809 *

5810 * Called from a sysrq handler and prints out all busy workqueues and pools.

5811 */

5812 void show_all_workqueues(void)

5813 {

5814 struct workqueue_struct *wq;

5815 struct worker_pool *pool;

5816 int pi;

5817

5818 rcu_read_lock();

5819

5820 pr_info("Showing busy workqueues and worker pools:\n");

5821

5822 list_for_each_entry_rcu(wq, &workqueues, list)

5823 show_one_workqueue(wq);

5824

5825 for_each_pool(pool, pi)

5826 show_one_worker_pool(pool);

5827

5828 rcu_read_unlock();

5829 }

5830

5831 /**

5832 * show_freezable_workqueues - dump freezable workqueue state

5833 *

5834 * Called from try_to_freeze_tasks() and prints out all freezable workqueues

5835 * still busy.

5836 */

5837 void show_freezable_workqueues(void)

5838 {

5839 struct workqueue_struct *wq;

5840

5841 rcu_read_lock();

5842

5843 pr_info("Showing freezable workqueues that are still busy:\n");

5844

5845 list_for_each_entry_rcu(wq, &workqueues, list) {

5846 if (!(wq->flags & WQ_FREEZABLE))

5847 continue;

5848 show_one_workqueue(wq);

5849 }

5850

5851 rcu_read_unlock();

5852 }

5853

5854 /* used to show worker information through /proc/PID/{comm,stat,status} */

5855 void wq_worker_comm(char *buf, size_t size, struct task_struct *task)

5856 {

5857 int off;

5858

5859 /* always show the actual comm */

5860 off = strscpy(buf, task->comm, size);

5861 if (off < 0)

5862 return;

5863

5864 /* stabilize PF_WQ_WORKER and worker pool association */

5865 mutex_lock(&wq_pool_attach_mutex);

5866

5867 if (task->flags & PF_WQ_WORKER) {

5868 struct worker *worker = kthread_data(task);

5869 struct worker_pool *pool = worker->pool;

5870

5871 if (pool) {

5872 raw_spin_lock_irq(&pool->lock);

5873 /*

5874 * ->desc tracks information (wq name or

5875 * set_worker_desc()) for the latest execution. If

5876 * current, prepend '+', otherwise '-'.

5877 */

5878 if (worker->desc[0] != '\0') {

5879 if (worker->current_work)

5880 scnprintf(buf + off, size - off, "+%s",

5881 worker->desc);

5882 else

5883 scnprintf(buf + off, size - off, "-%s",

5884 worker->desc);

5885 }

5886 raw_spin_unlock_irq(&pool->lock);

5887 }

5888 }

5889

5890 mutex_unlock(&wq_pool_attach_mutex);

5891 }

5892

5893 #ifdef CONFIG_SMP

5894

5895 /*

5896 * CPU hotplug.

5897 *

5898 * There are two challenges in supporting CPU hotplug. Firstly, there

5899 * are a lot of assumptions on strong associations among work, pwq and

5900 * pool which make migrating pending and scheduled works very

5901 * difficult to implement without impacting hot paths. Secondly,

5902 * worker pools serve a mix of short, long and very long running works making

5903 * blocked draining impractical.

5904 *

5905 * This is solved by allowing the pools to be disassociated from the CPU

5906 * running as an unbound one and allowing it to be reattached later if the

5907 * cpu comes back online.

5908 */

5909

5910 static void unbind_workers(int cpu)

5911 {

5912 struct worker_pool *pool;

5913 struct worker *worker;

5914

5915 for_each_cpu_worker_pool(pool, cpu) {

5916 mutex_lock(&wq_pool_attach_mutex);

5917 raw_spin_lock_irq(&pool->lock);

5918

5919 /*

5920 * We've blocked all attach/detach operations. Make all workers

5921 * unbound and set DISASSOCIATED. Before this, all workers

5922 * must be on the cpu. After this, they may become diasporas.

5923 * And the preemption disabled section in their sched callbacks

5924 * are guaranteed to see WORKER_UNBOUND since the code here

5925 * is on the same cpu.

5926 */

5927 for_each_pool_worker(worker, pool)

5928 worker->flags |= WORKER_UNBOUND;

5929

5930 pool->flags |= POOL_DISASSOCIATED;

5931

5932 /*

5933 * The handling of nr_running in sched callbacks are disabled

5934 * now. Zap nr_running. After this, nr_running stays zero and

5935 * need_more_worker() and keep_working() are always true as

5936 * long as the worklist is not empty. This pool now behaves as

5937 * an unbound (in terms of concurrency management) pool which

5938 * are served by workers tied to the pool.

5939 */

5940 pool->nr_running = 0;

5941

5942 /*

5943 * With concurrency management just turned off, a busy

5944 * worker blocking could lead to lengthy stalls. Kick off

5945 * unbound chain execution of currently pending work items.

5946 */

5947 kick_pool(pool);

5948

5949 raw_spin_unlock_irq(&pool->lock);

5950

5951 for_each_pool_worker(worker, pool)

5952 unbind_worker(worker);

5953

5954 mutex_unlock(&wq_pool_attach_mutex);

5955 }

5956 }

5957

5958 /**

5959 * rebind_workers - rebind all workers of a pool to the associated CPU

5960 * @pool: pool of interest

5961 *

5962 * @pool->cpu is coming online. Rebind all workers to the CPU.

5963 */

5964 static void rebind_workers(struct worker_pool *pool)

5965 {

5966 struct worker *worker;

5967

5968 lockdep_assert_held(&wq_pool_attach_mutex);

5969

5970 /*

5971 * Restore CPU affinity of all workers. As all idle workers should

5972 * be on the run-queue of the associated CPU before any local

5973 * wake-ups for concurrency management happen, restore CPU affinity

5974 * of all workers first and then clear UNBOUND. As we're called

5975 * from CPU_ONLINE, the following shouldn't fail.

5976 */

5977 for_each_pool_worker(worker, pool) {

5978 kthread_set_per_cpu(worker->task, pool->cpu);

5979 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,

5980 pool_allowed_cpus(pool)) < 0);

5981 }

5982

5983 raw_spin_lock_irq(&pool->lock);

5984

5985 pool->flags &= ~POOL_DISASSOCIATED;

5986

5987 for_each_pool_worker(worker, pool) {

5988 unsigned int worker_flags = worker->flags;

5989

5990 /*

5991 * We want to clear UNBOUND but can't directly call

5992 * worker_clr_flags() or adjust nr_running. Atomically

5993 * replace UNBOUND with another NOT_RUNNING flag REBOUND.

5994 * @worker will clear REBOUND using worker_clr_flags() when

5995 * it initiates the next execution cycle thus restoring

5996 * concurrency management. Note that when or whether

5997 * @worker clears REBOUND doesn't affect correctness.

5998 *

5999 * WRITE_ONCE() is necessary because @worker->flags may be

6000 * tested without holding any lock in

6001 * wq_worker_running(). Without it, NOT_RUNNING test may

6002 * fail incorrectly leading to premature concurrency

6003 * management operations.

6004 */

6005 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));

6006 worker_flags |= WORKER_REBOUND;

6007 worker_flags &= ~WORKER_UNBOUND;

6008 WRITE_ONCE(worker->flags, worker_flags);

6009 }

6010

6011 raw_spin_unlock_irq(&pool->lock);

6012 }

6013

6014 /**

6015 * restore_unbound_workers_cpumask - restore cpumask of unbound workers

6016 * @pool: unbound pool of interest

6017 * @cpu: the CPU which is coming up

6018 *

6019 * An unbound pool may end up with a cpumask which doesn't have any online

6020 * CPUs. When a worker of such pool get scheduled, the scheduler resets

6021 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any

6022 * online CPU before, cpus_allowed of all its workers should be restored.

6023 */

6024 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)

6025 {

6026 static cpumask_t cpumask;

6027 struct worker *worker;

6028

6029 lockdep_assert_held(&wq_pool_attach_mutex);

6030

6031 /* is @cpu allowed for @pool? */

6032 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))

6033 return;

6034

6035 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);

6036

6037 /* as we're called from CPU_ONLINE, the following shouldn't fail */

6038 for_each_pool_worker(worker, pool)

6039 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);

6040 }

6041

6042 int workqueue_prepare_cpu(unsigned int cpu)

6043 {

6044 struct worker_pool *pool;

6045

6046 for_each_cpu_worker_pool(pool, cpu) {

6047 if (pool->nr_workers)

6048 continue;

6049 if (!create_worker(pool))

6050 return -ENOMEM;

6051 }

6052 return 0;

6053 }

6054

6055 int workqueue_online_cpu(unsigned int cpu)

6056 {

6057 struct worker_pool *pool;

6058 struct workqueue_struct *wq;

6059 int pi;

6060

6061 mutex_lock(&wq_pool_mutex);

6062

6063 for_each_pool(pool, pi) {

6064 mutex_lock(&wq_pool_attach_mutex);

6065

6066 if (pool->cpu == cpu)

6067 rebind_workers(pool);

6068 else if (pool->cpu < 0)

6069 restore_unbound_workers_cpumask(pool, cpu);

6070

6071 mutex_unlock(&wq_pool_attach_mutex);

6072 }

6073

6074 /* update pod affinity of unbound workqueues */

6075 list_for_each_entry(wq, &workqueues, list) {

6076 struct workqueue_attrs *attrs = wq->unbound_attrs;

6077

6078 if (attrs) {

6079 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);

6080 int tcpu;

6081

6082 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])

6083 wq_update_pod(wq, tcpu, cpu, true);

6084

6085 mutex_lock(&wq->mutex);

6086 wq_update_node_max_active(wq, -1);

6087 mutex_unlock(&wq->mutex);

6088 }

6089 }

6090

6091 mutex_unlock(&wq_pool_mutex);

6092 return 0;

6093 }

6094

6095 int workqueue_offline_cpu(unsigned int cpu)

6096 {

6097 struct workqueue_struct *wq;

6098

6099 /* unbinding per-cpu workers should happen on the local CPU */

6100 if (WARN_ON(cpu != smp_processor_id()))

6101 return -1;

6102

6103 unbind_workers(cpu);

6104

6105 /* update pod affinity of unbound workqueues */

6106 mutex_lock(&wq_pool_mutex);

6107 list_for_each_entry(wq, &workqueues, list) {

6108 struct workqueue_attrs *attrs = wq->unbound_attrs;

6109

6110 if (attrs) {

6111 const struct wq_pod_type *pt = wqattrs_pod_type(attrs);

6112 int tcpu;

6113

6114 for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])

6115 wq_update_pod(wq, tcpu, cpu, false);

6116

6117 mutex_lock(&wq->mutex);

6118 wq_update_node_max_active(wq, cpu);

6119 mutex_unlock(&wq->mutex);

6120 }

6121 }

6122 mutex_unlock(&wq_pool_mutex);

6123

6124 return 0;

6125 }

6126

6127 struct work_for_cpu {

6128 struct work_struct work;

6129 long (*fn)(void *);

6130 void *arg;

6131 long ret;

6132 };

6133

6134 static void work_for_cpu_fn(struct work_struct *work)

6135 {

6136 struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);

6137

6138 wfc->ret = wfc->fn(wfc->arg);

6139 }

6140

6141 /**

6142 * work_on_cpu_key - run a function in thread context on a particular cpu

6143 * @cpu: the cpu to run on

6144 * @fn: the function to run

6145 * @arg: the function arg

6146 * @key: The lock class key for lock debugging purposes

6147 *

6148 * It is up to the caller to ensure that the cpu doesn't go offline.

6149 * The caller must not hold any locks which would prevent @fn from completing.

6150 *

6151 * Return: The value @fn returns.

6152 */

6153 long work_on_cpu_key(int cpu, long (*fn)(void *),

6154 void *arg, struct lock_class_key *key)

6155 {

6156 struct work_for_cpu wfc = { .fn = fn, .arg = arg };

6157

6158 INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);

6159 schedule_work_on(cpu, &wfc.work);

6160 flush_work(&wfc.work);

6161 destroy_work_on_stack(&wfc.work);

6162 return wfc.ret;

6163 }

6164 EXPORT_SYMBOL_GPL(work_on_cpu_key);

6165

6166 /**

6167 * work_on_cpu_safe_key - run a function in thread context on a particular cpu

6168 * @cpu: the cpu to run on

6169 * @fn: the function to run

6170 * @arg: the function argument

6171 * @key: The lock class key for lock debugging purposes

6172 *

6173 * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold

6174 * any locks which would prevent @fn from completing.

6175 *

6176 * Return: The value @fn returns.

6177 */

6178 long work_on_cpu_safe_key(int cpu, long (*fn)(void *),

6179 void *arg, struct lock_class_key *key)

6180 {

6181 long ret = -ENODEV;

6182

6183 cpus_read_lock();

6184 if (cpu_online(cpu))

6185 ret = work_on_cpu_key(cpu, fn, arg, key);

6186 cpus_read_unlock();

6187 return ret;

6188 }

6189 EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);

6190 #endif /* CONFIG_SMP */

6191

6192 #ifdef CONFIG_FREEZER

6193

6194 /**

6195 * freeze_workqueues_begin - begin freezing workqueues

6196 *

6197 * Start freezing workqueues. After this function returns, all freezable

6198 * workqueues will queue new works to their inactive_works list instead of

6199 * pool->worklist.

6200 *

6201 * CONTEXT:

6202 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.

6203 */

6204 void freeze_workqueues_begin(void)

6205 {

6206 struct workqueue_struct *wq;

6207

6208 mutex_lock(&wq_pool_mutex);

6209

6210 WARN_ON_ONCE(workqueue_freezing);

6211 workqueue_freezing = true;

6212

6213 list_for_each_entry(wq, &workqueues, list) {

6214 mutex_lock(&wq->mutex);

6215 wq_adjust_max_active(wq);

6216 mutex_unlock(&wq->mutex);

6217 }

6218

6219 mutex_unlock(&wq_pool_mutex);

6220 }

6221

6222 /**

6223 * freeze_workqueues_busy - are freezable workqueues still busy?

6224 *

6225 * Check whether freezing is complete. This function must be called

6226 * between freeze_workqueues_begin() and thaw_workqueues().

6227 *

6228 * CONTEXT:

6229 * Grabs and releases wq_pool_mutex.

6230 *

6231 * Return:

6232 * %true if some freezable workqueues are still busy. %false if freezing

6233 * is complete.

6234 */

6235 bool freeze_workqueues_busy(void)

6236 {

6237 bool busy = false;

6238 struct workqueue_struct *wq;

6239 struct pool_workqueue *pwq;

6240

6241 mutex_lock(&wq_pool_mutex);

6242

6243 WARN_ON_ONCE(!workqueue_freezing);

6244

6245 list_for_each_entry(wq, &workqueues, list) {

6246 if (!(wq->flags & WQ_FREEZABLE))

6247 continue;

6248 /*

6249 * nr_active is monotonically decreasing. It's safe

6250 * to peek without lock.

6251 */

6252 rcu_read_lock();

6253 for_each_pwq(pwq, wq) {

6254 WARN_ON_ONCE(pwq->nr_active < 0);

6255 if (pwq->nr_active) {

6256 busy = true;

6257 rcu_read_unlock();

6258 goto out_unlock;

6259 }

6260 }

6261 rcu_read_unlock();

6262 }

6263 out_unlock:

6264 mutex_unlock(&wq_pool_mutex);

6265 return busy;

6266 }

6267

6268 /**

6269 * thaw_workqueues - thaw workqueues

6270 *

6271 * Thaw workqueues. Normal queueing is restored and all collected

6272 * frozen works are transferred to their respective pool worklists.

6273 *

6274 * CONTEXT:

6275 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.

6276 */

6277 void thaw_workqueues(void)

6278 {

6279 struct workqueue_struct *wq;

6280

6281 mutex_lock(&wq_pool_mutex);

6282

6283 if (!workqueue_freezing)

6284 goto out_unlock;

6285

6286 workqueue_freezing = false;

6287

6288 /* restore max_active and repopulate worklist */

6289 list_for_each_entry(wq, &workqueues, list) {

6290 mutex_lock(&wq->mutex);

6291 wq_adjust_max_active(wq);

6292 mutex_unlock(&wq->mutex);

6293 }

6294

6295 out_unlock:

6296 mutex_unlock(&wq_pool_mutex);

6297 }

6298 #endif /* CONFIG_FREEZER */

6299

6300 static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)

6301 {

6302 LIST_HEAD(ctxs);

6303 int ret = 0;

6304 struct workqueue_struct *wq;

6305 struct apply_wqattrs_ctx *ctx, *n;

6306

6307 lockdep_assert_held(&wq_pool_mutex);

6308

6309 list_for_each_entry(wq, &workqueues, list) {

6310 if (!(wq->flags & WQ_UNBOUND))

6311 continue;

6312

6313 /* creating multiple pwqs breaks ordering guarantee */

6314 if (!list_empty(&wq->pwqs)) {

6315 if (wq->flags & __WQ_ORDERED_EXPLICIT)

6316 continue;

6317 wq->flags &= ~__WQ_ORDERED;

6318 }

6319

6320 ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);

6321 if (IS_ERR(ctx)) {

6322 ret = PTR_ERR(ctx);

6323 break;

6324 }

6325

6326 list_add_tail(&ctx->list, &ctxs);

6327 }

6328

6329 list_for_each_entry_safe(ctx, n, &ctxs, list) {

6330 if (!ret)

6331 apply_wqattrs_commit(ctx);

6332 apply_wqattrs_cleanup(ctx);

6333 }

6334

6335 if (!ret) {

6336 mutex_lock(&wq_pool_attach_mutex);

6337 cpumask_copy(wq_unbound_cpumask, unbound_cpumask);

6338 mutex_unlock(&wq_pool_attach_mutex);

6339 }

6340 return ret;

6341 }

6342

6343 /**

6344 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask

6345 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask

6346 *

6347 * This function can be called from cpuset code to provide a set of isolated

6348 * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold

6349 * either cpus_read_lock or cpus_write_lock.

6350 */

6351 int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)

6352 {

6353 cpumask_var_t cpumask;

6354 int ret = 0;

6355

6356 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))

6357 return -ENOMEM;

6358

6359 lockdep_assert_cpus_held();

6360 mutex_lock(&wq_pool_mutex);

6361

6362 /* Save the current isolated cpumask & export it via sysfs */

6363 cpumask_copy(wq_isolated_cpumask, exclude_cpumask);

6364

6365 /*

6366 * If the operation fails, it will fall back to

6367 * wq_requested_unbound_cpumask which is initially set to

6368 * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten

6369 * by any subsequent write to workqueue/cpumask sysfs file.

6370 */

6371 if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask))

6372 cpumask_copy(cpumask, wq_requested_unbound_cpumask);

6373 if (!cpumask_equal(cpumask, wq_unbound_cpumask))

6374 ret = workqueue_apply_unbound_cpumask(cpumask);

6375

6376 mutex_unlock(&wq_pool_mutex);

6377 free_cpumask_var(cpumask);

6378 return ret;

6379 }

6380

6381 static int parse_affn_scope(const char *val)

6382 {

6383 int i;

6384

6385 for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) {

6386 if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i])))

6387 return i;

6388 }

6389 return -EINVAL;

6390 }

6391

6392 static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)

6393 {

6394 struct workqueue_struct *wq;

6395 int affn, cpu;

6396

6397 affn = parse_affn_scope(val);

6398 if (affn < 0)

6399 return affn;

6400 if (affn == WQ_AFFN_DFL)

6401 return -EINVAL;

6402

6403 cpus_read_lock();

6404 mutex_lock(&wq_pool_mutex);

6405

6406 wq_affn_dfl = affn;

6407

6408 list_for_each_entry(wq, &workqueues, list) {

6409 for_each_online_cpu(cpu) {

6410 wq_update_pod(wq, cpu, cpu, true);

6411 }

6412 }

6413

6414 mutex_unlock(&wq_pool_mutex);

6415 cpus_read_unlock();

6416

6417 return 0;

6418 }

6419

6420 static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)

6421 {

6422 return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]);

6423 }

6424

6425 static const struct kernel_param_ops wq_affn_dfl_ops = {

6426 .set = wq_affn_dfl_set,

6427 .get = wq_affn_dfl_get,

6428 };

6429

6430 module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);

6431

6432 #ifdef CONFIG_SYSFS

6433 /*

6434 * Workqueues with the WQ_SYSFS flag set are visible to userland via

6435 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the

6436 * following attributes.

6437 *

6438 * per_cpu RO bool : whether the workqueue is per-cpu or unbound

6439 * max_active RW int : maximum number of in-flight work items

6440 *

6441 * Unbound workqueues have the following extra attributes.

6442 *

6443 * nice RW int : nice value of the workers

6444 * cpumask RW mask : bitmask of allowed CPUs for the workers

6445 * affinity_scope RW str : worker CPU affinity scope (cache, numa, none)

6446 * affinity_strict RW bool : worker CPU affinity is strict

6447 */

6448 struct wq_device {

6449 struct workqueue_struct *wq;

6450 struct device dev;

6451 };

6452

6453 static struct workqueue_struct *dev_to_wq(struct device *dev)

6454 {

6455 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);

6456

6457 return wq_dev->wq;

6458 }

6459

6460 static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,

6461 char *buf)

6462 {

6463 struct workqueue_struct *wq = dev_to_wq(dev);

6464

6465 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));

6466 }

6467 static DEVICE_ATTR_RO(per_cpu);

6468

6469 static ssize_t max_active_show(struct device *dev,

6470 struct device_attribute *attr, char *buf)

6471 {

6472 struct workqueue_struct *wq = dev_to_wq(dev);

6473

6474 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);

6475 }

6476

6477 static ssize_t max_active_store(struct device *dev,

6478 struct device_attribute *attr, const char *buf,

6479 size_t count)

6480 {

6481 struct workqueue_struct *wq = dev_to_wq(dev);

6482 int val;

6483

6484 if (sscanf(buf, "%d", &val) != 1 || val <= 0)

6485 return -EINVAL;

6486

6487 workqueue_set_max_active(wq, val);

6488 return count;

6489 }

6490 static DEVICE_ATTR_RW(max_active);

6491

6492 static struct attribute *wq_sysfs_attrs[] = {

6493 &dev_attr_per_cpu.attr,

6494 &dev_attr_max_active.attr,

6495 NULL,

6496 };

6497 ATTRIBUTE_GROUPS(wq_sysfs);

6498

6499 static void apply_wqattrs_lock(void)

6500 {

6501 /* CPUs should stay stable across pwq creations and installations */

6502 cpus_read_lock();

6503 mutex_lock(&wq_pool_mutex);

6504 }

6505

6506 static void apply_wqattrs_unlock(void)

6507 {

6508 mutex_unlock(&wq_pool_mutex);

6509 cpus_read_unlock();

6510 }

6511

6512 static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,

6513 char *buf)

6514 {

6515 struct workqueue_struct *wq = dev_to_wq(dev);

6516 int written;

6517

6518 mutex_lock(&wq->mutex);

6519 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);

6520 mutex_unlock(&wq->mutex);

6521

6522 return written;

6523 }

6524

6525 /* prepare workqueue_attrs for sysfs store operations */

6526 static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)

6527 {

6528 struct workqueue_attrs *attrs;

6529

6530 lockdep_assert_held(&wq_pool_mutex);

6531

6532 attrs = alloc_workqueue_attrs();

6533 if (!attrs)

6534 return NULL;

6535

6536 copy_workqueue_attrs(attrs, wq->unbound_attrs);

6537 return attrs;

6538 }

6539

6540 static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,

6541 const char *buf, size_t count)

6542 {

6543 struct workqueue_struct *wq = dev_to_wq(dev);

6544 struct workqueue_attrs *attrs;

6545 int ret = -ENOMEM;

6546

6547 apply_wqattrs_lock();

6548

6549 attrs = wq_sysfs_prep_attrs(wq);

6550 if (!attrs)

6551 goto out_unlock;

6552

6553 if (sscanf(buf, "%d", &attrs->nice) == 1 &&

6554 attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)

6555 ret = apply_workqueue_attrs_locked(wq, attrs);

6556 else

6557 ret = -EINVAL;

6558

6559 out_unlock:

6560 apply_wqattrs_unlock();

6561 free_workqueue_attrs(attrs);

6562 return ret ?: count;

6563 }

6564

6565 static ssize_t wq_cpumask_show(struct device *dev,

6566 struct device_attribute *attr, char *buf)

6567 {

6568 struct workqueue_struct *wq = dev_to_wq(dev);

6569 int written;

6570

6571 mutex_lock(&wq->mutex);

6572 written = scnprintf(buf, PAGE_SIZE, "%*pb\n",

6573 cpumask_pr_args(wq->unbound_attrs->cpumask));

6574 mutex_unlock(&wq->mutex);

6575 return written;

6576 }

6577

6578 static ssize_t wq_cpumask_store(struct device *dev,

6579 struct device_attribute *attr,

6580 const char *buf, size_t count)

6581 {

6582 struct workqueue_struct *wq = dev_to_wq(dev);

6583 struct workqueue_attrs *attrs;

6584 int ret = -ENOMEM;

6585

6586 apply_wqattrs_lock();

6587

6588 attrs = wq_sysfs_prep_attrs(wq);

6589 if (!attrs)

6590 goto out_unlock;

6591

6592 ret = cpumask_parse(buf, attrs->cpumask);

6593 if (!ret)

6594 ret = apply_workqueue_attrs_locked(wq, attrs);

6595

6596 out_unlock:

6597 apply_wqattrs_unlock();

6598 free_workqueue_attrs(attrs);

6599 return ret ?: count;

6600 }

6601

6602 static ssize_t wq_affn_scope_show(struct device *dev,

6603 struct device_attribute *attr, char *buf)

6604 {

6605 struct workqueue_struct *wq = dev_to_wq(dev);

6606 int written;

6607

6608 mutex_lock(&wq->mutex);

6609 if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)

6610 written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",

6611 wq_affn_names[WQ_AFFN_DFL],

6612 wq_affn_names[wq_affn_dfl]);

6613 else

6614 written = scnprintf(buf, PAGE_SIZE, "%s\n",

6615 wq_affn_names[wq->unbound_attrs->affn_scope]);

6616 mutex_unlock(&wq->mutex);

6617

6618 return written;

6619 }

6620

6621 static ssize_t wq_affn_scope_store(struct device *dev,

6622 struct device_attribute *attr,

6623 const char *buf, size_t count)

6624 {

6625 struct workqueue_struct *wq = dev_to_wq(dev);

6626 struct workqueue_attrs *attrs;

6627 int affn, ret = -ENOMEM;

6628

6629 affn = parse_affn_scope(buf);

6630 if (affn < 0)

6631 return affn;

6632

6633 apply_wqattrs_lock();

6634 attrs = wq_sysfs_prep_attrs(wq);

6635 if (attrs) {

6636 attrs->affn_scope = affn;

6637 ret = apply_workqueue_attrs_locked(wq, attrs);

6638 }

6639 apply_wqattrs_unlock();

6640 free_workqueue_attrs(attrs);

6641 return ret ?: count;

6642 }

6643

6644 static ssize_t wq_affinity_strict_show(struct device *dev,

6645 struct device_attribute *attr, char *buf)

6646 {

6647 struct workqueue_struct *wq = dev_to_wq(dev);

6648

6649 return scnprintf(buf, PAGE_SIZE, "%d\n",

6650 wq->unbound_attrs->affn_strict);

6651 }

6652

6653 static ssize_t wq_affinity_strict_store(struct device *dev,

6654 struct device_attribute *attr,

6655 const char *buf, size_t count)

6656 {

6657 struct workqueue_struct *wq = dev_to_wq(dev);

6658 struct workqueue_attrs *attrs;

6659 int v, ret = -ENOMEM;

6660

6661 if (sscanf(buf, "%d", &v) != 1)

6662 return -EINVAL;

6663

6664 apply_wqattrs_lock();

6665 attrs = wq_sysfs_prep_attrs(wq);

6666 if (attrs) {

6667 attrs->affn_strict = (bool)v;

6668 ret = apply_workqueue_attrs_locked(wq, attrs);

6669 }

6670 apply_wqattrs_unlock();

6671 free_workqueue_attrs(attrs);

6672 return ret ?: count;

6673 }

6674

6675 static struct device_attribute wq_sysfs_unbound_attrs[] = {

6676 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),

6677 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),

6678 __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),

6679 __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),

6680 __ATTR_NULL,

6681 };

6682

6683 static struct bus_type wq_subsys = {

6684 .name = "workqueue",

6685 .dev_groups = wq_sysfs_groups,

6686 };

6687

6688 /**

6689 * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask

6690 * @cpumask: the cpumask to set

6691 *

6692 * The low-level workqueues cpumask is a global cpumask that limits

6693 * the affinity of all unbound workqueues. This function check the @cpumask

6694 * and apply it to all unbound workqueues and updates all pwqs of them.

6695 *

6696 * Return: 0 - Success

6697 * -EINVAL - Invalid @cpumask

6698 * -ENOMEM - Failed to allocate memory for attrs or pwqs.

6699 */

6700 static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)

6701 {

6702 int ret = -EINVAL;

6703

6704 /*

6705 * Not excluding isolated cpus on purpose.

6706 * If the user wishes to include them, we allow that.

6707 */

6708 cpumask_and(cpumask, cpumask, cpu_possible_mask);

6709 if (!cpumask_empty(cpumask)) {

6710 apply_wqattrs_lock();

6711 cpumask_copy(wq_requested_unbound_cpumask, cpumask);

6712 if (cpumask_equal(cpumask, wq_unbound_cpumask)) {

6713 ret = 0;

6714 goto out_unlock;

6715 }

6716

6717 ret = workqueue_apply_unbound_cpumask(cpumask);

6718

6719 out_unlock:

6720 apply_wqattrs_unlock();

6721 }

6722

6723 return ret;

6724 }

6725

6726 static ssize_t __wq_cpumask_show(struct device *dev,

6727 struct device_attribute *attr, char *buf, cpumask_var_t mask)

6728 {

6729 int written;

6730

6731 mutex_lock(&wq_pool_mutex);

6732 written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));

6733 mutex_unlock(&wq_pool_mutex);

6734

6735 return written;

6736 }

6737

6738 static ssize_t wq_unbound_cpumask_show(struct device *dev,

6739 struct device_attribute *attr, char *buf)

6740 {

6741 return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);

6742 }

6743

6744 static ssize_t wq_requested_cpumask_show(struct device *dev,

6745 struct device_attribute *attr, char *buf)

6746 {

6747 return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);

6748 }

6749

6750 static ssize_t wq_isolated_cpumask_show(struct device *dev,

6751 struct device_attribute *attr, char *buf)

6752 {

6753 return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);

6754 }

6755

6756 static ssize_t wq_unbound_cpumask_store(struct device *dev,

6757 struct device_attribute *attr, const char *buf, size_t count)

6758 {

6759 cpumask_var_t cpumask;

6760 int ret;

6761

6762 if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))

6763 return -ENOMEM;

6764

6765 ret = cpumask_parse(buf, cpumask);

6766 if (!ret)

6767 ret = workqueue_set_unbound_cpumask(cpumask);

6768

6769 free_cpumask_var(cpumask);

6770 return ret ? ret : count;

6771 }

6772

6773 static struct device_attribute wq_sysfs_cpumask_attrs[] = {

6774 __ATTR(cpumask, 0644, wq_unbound_cpumask_show,

6775 wq_unbound_cpumask_store),

6776 __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL),

6777 __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL),

6778 __ATTR_NULL,

6779 };

6780

6781 static int __init wq_sysfs_init(void)

6782 {

6783 struct device *dev_root;

6784 int err;

6785

6786 err = subsys_virtual_register(&wq_subsys, NULL);

6787 if (err)

6788 return err;

6789

6790 dev_root = bus_get_dev_root(&wq_subsys);

6791 if (dev_root) {

6792 struct device_attribute *attr;

6793

6794 for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) {

6795 err = device_create_file(dev_root, attr);

6796 if (err)

6797 break;

6798 }

6799 put_device(dev_root);

6800 }

6801 return err;

6802 }

6803 core_initcall(wq_sysfs_init);

6804

6805 static void wq_device_release(struct device *dev)

6806 {

6807 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);

6808

6809 kfree(wq_dev);

6810 }

6811

6812 /**

6813 * workqueue_sysfs_register - make a workqueue visible in sysfs

6814 * @wq: the workqueue to register

6815 *

6816 * Expose @wq in sysfs under /sys/bus/workqueue/devices.

6817 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set

6818 * which is the preferred method.

6819 *

6820 * Workqueue user should use this function directly iff it wants to apply

6821 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,

6822 * apply_workqueue_attrs() may race against userland updating the

6823 * attributes.

6824 *

6825 * Return: 0 on success, -errno on failure.

6826 */

6827 int workqueue_sysfs_register(struct workqueue_struct *wq)

6828 {

6829 struct wq_device *wq_dev;

6830 int ret;

6831

6832 /*

6833 * Adjusting max_active or creating new pwqs by applying

6834 * attributes breaks ordering guarantee. Disallow exposing ordered

6835 * workqueues.

6836 */

6837 if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))

6838 return -EINVAL;

6839

6840 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);

6841 if (!wq_dev)

6842 return -ENOMEM;

6843

6844 wq_dev->wq = wq;

6845 wq_dev->dev.bus = &wq_subsys;

6846 wq_dev->dev.release = wq_device_release;

6847 dev_set_name(&wq_dev->dev, "%s", wq->name);

6848

6849 /*

6850 * unbound_attrs are created separately. Suppress uevent until

6851 * everything is ready.

6852 */

6853 dev_set_uevent_suppress(&wq_dev->dev, true);

6854

6855 ret = device_register(&wq_dev->dev);

6856 if (ret) {

6857 put_device(&wq_dev->dev);

6858 wq->wq_dev = NULL;

6859 return ret;

6860 }

6861

6862 if (wq->flags & WQ_UNBOUND) {

6863 struct device_attribute *attr;

6864

6865 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {

6866 ret = device_create_file(&wq_dev->dev, attr);

6867 if (ret) {

6868 device_unregister(&wq_dev->dev);

6869 wq->wq_dev = NULL;

6870 return ret;

6871 }

6872 }

6873 }

6874

6875 dev_set_uevent_suppress(&wq_dev->dev, false);

6876 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);

6877 return 0;

6878 }

6879

6880 /**

6881 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()

6882 * @wq: the workqueue to unregister

6883 *

6884 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.

6885 */

6886 static void workqueue_sysfs_unregister(struct workqueue_struct *wq)

6887 {

6888 struct wq_device *wq_dev = wq->wq_dev;

6889

6890 if (!wq->wq_dev)

6891 return;

6892

6893 wq->wq_dev = NULL;

6894 device_unregister(&wq_dev->dev);

6895 }

6896 #else /* CONFIG_SYSFS */

/* !CONFIG_SYSFS variant: intentionally empty. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
}

6898 #endif /* CONFIG_SYSFS */

6899

6900 /*

6901 * Workqueue watchdog.

6902 *

6903 * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal

6904 * flush dependency, a concurrency managed work item which stays RUNNING

6905 * indefinitely. Workqueue stalls can be very difficult to debug as the

6906 * usual warning mechanisms don't trigger and internal workqueue state is

6907 * largely opaque.

6908 *

6909 * Workqueue watchdog monitors all worker pools periodically and dumps

6910 * state if some pools failed to make forward progress for a while where

6911 * forward progress is defined as the first item on ->worklist changing.

6912 *

6913 * This mechanism is controlled through the kernel parameter

6914 * "workqueue.watchdog_thresh" which can be updated at runtime through the

6915 * corresponding sysfs parameter file.

6916 */

6917 #ifdef CONFIG_WQ_WATCHDOG

6918

/*
 * Stall threshold. wq_watchdog_timer_fn() multiplies it by HZ before use
 * and returns without re-arming the timer when it is 0.
 */
static unsigned long wq_watchdog_thresh = 30;
/* Timer that runs wq_watchdog_timer_fn(); set up in wq_watchdog_init(). */
static struct timer_list wq_watchdog_timer;

/*
 * "Last touched" timestamps in jiffies. The timer function compares a pool
 * against the per-CPU stamp when pool->cpu >= 0 and against the global
 * stamp otherwise.
 */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

6924

6925 /*

6926 * Show workers that might prevent the processing of pending work items.

6927 * The only candidates are CPU-bound workers in the running state.

6928 * Pending work items should be handled by another idle worker

6929 * in all other situations.

6930 */

6931 static void show_cpu_pool_hog(struct worker_pool *pool)

6932 {

6933 struct worker *worker;

6934 unsigned long flags;

6935 int bkt;

6936

6937 raw_spin_lock_irqsave(&pool->lock, flags);

6938

6939 hash_for_each(pool->busy_hash, bkt, worker, hentry) {

6940 if (task_is_running(worker->task)) {

6941 /*

6942 * Defer printing to avoid deadlocks in console

6943 * drivers that queue work while holding locks

6944 * also taken in their write paths.

6945 */

6946 printk_deferred_enter();

6947

6948 pr_info("pool %d:\n", pool->id);

6949 sched_show_task(worker->task);

6950

6951 printk_deferred_exit();

6952 }

6953 }

6954

6955 raw_spin_unlock_irqrestore(&pool->lock, flags);

6956 }

6957

6958 static void show_cpu_pools_hogs(void)

6959 {

6960 struct worker_pool *pool;

6961 int pi;

6962

6963 pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");

6964

6965 rcu_read_lock();

6966

6967 for_each_pool(pool, pi) {

6968 if (pool->cpu_stall)

6969 show_cpu_pool_hog(pool);

6970

6971 }

6972

6973 rcu_read_unlock();

6974 }

6975

6976 static void wq_watchdog_reset_touched(void)

6977 {

6978 int cpu;

6979

6980 wq_watchdog_touched = jiffies;

6981 for_each_possible_cpu(cpu)

6982 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;

6983 }

6984

6985 static void wq_watchdog_timer_fn(struct timer_list *unused)

6986 {

6987 unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;

6988 bool lockup_detected = false;

6989 bool cpu_pool_stall = false;

6990 unsigned long now = jiffies;

6991 struct worker_pool *pool;

6992 int pi;

6993

6994 if (!thresh)

6995 return;

6996

6997 rcu_read_lock();

6998

6999 for_each_pool(pool, pi) {

7000 unsigned long pool_ts, touched, ts;

7001

7002 pool->cpu_stall = false;

7003 if (list_empty(&pool->worklist))

7004 continue;

7005

7006 /*

7007 * If a virtual machine is stopped by the host it can look to

7008 * the watchdog like a stall.

7009 */

7010 kvm_check_and_clear_guest_paused();

7011

7012 /* get the latest of pool and touched timestamps */

7013 if (pool->cpu >= 0)

7014 touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));

7015 else

7016 touched = READ_ONCE(wq_watchdog_touched);

7017 pool_ts = READ_ONCE(pool->watchdog_ts);

7018

7019 if (time_after(pool_ts, touched))

7020 ts = pool_ts;

7021 else

7022 ts = touched;

7023

7024 /* did we stall? */

7025 if (time_after(now, ts + thresh)) {

7026 lockup_detected = true;

7027 if (pool->cpu >= 0) {

7028 pool->cpu_stall = true;

7029 cpu_pool_stall = true;

7030 }

7031 pr_emerg("BUG: workqueue lockup - pool");

7032 pr_cont_pool_info(pool);

7033 pr_cont(" stuck for %us!\n",

7034 jiffies_to_msecs(now - pool_ts) / 1000);

7035 }

7036

7037

7038 }

7039

7040 rcu_read_unlock();

7041

7042 if (lockup_detected)

7043 show_all_workqueues();

7044

7045 if (cpu_pool_stall)

7046 show_cpu_pools_hogs();

7047

7048 wq_watchdog_reset_touched();

7049 mod_timer(&wq_watchdog_timer, jiffies + thresh);

7050 }

7051

7052 notrace void wq_watchdog_touch(int cpu)

7053 {

7054 if (cpu >= 0)

7055 per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;

7056

7057 wq_watchdog_touched = jiffies;

7058 }

7059

7060 static void wq_watchdog_set_thresh(unsigned long thresh)

7061 {

7062 wq_watchdog_thresh = 0;

7063 del_timer_sync(&wq_watchdog_timer);

7064

7065 if (thresh) {

7066 wq_watchdog_thresh = thresh;

7067 wq_watchdog_reset_touched();

7068 mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);

7069 }

7070 }

7071

7072 static int wq_watchdog_param_set_thresh(const char *val,

7073 const struct kernel_param *kp)

7074 {

7075 unsigned long thresh;

7076 int ret;

7077

7078 ret = kstrtoul(val, 0, &thresh);

7079 if (ret)

7080 return ret;

7081

7082 if (system_wq)

7083 wq_watchdog_set_thresh(thresh);

7084 else

7085 wq_watchdog_thresh = thresh;

7086

7087 return 0;

7088 }

7089

7090 static const struct kernel_param_ops wq_watchdog_thresh_ops = {

7091 .set = wq_watchdog_param_set_thresh,

7092 .get = param_get_ulong,

7093 };

7094

7095 module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,

7096 0644);

7097

7098 static void wq_watchdog_init(void)

7099 {

7100 timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);

7101 wq_watchdog_set_thresh(wq_watchdog_thresh);

7102 }

7103

7104 #else /* CONFIG_WQ_WATCHDOG */

7105

/* !CONFIG_WQ_WATCHDOG variant: intentionally empty. */
static inline void wq_watchdog_init(void)
{
}

7107

7108 #endif /* CONFIG_WQ_WATCHDOG */

7109

7110 static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)

7111 {

7112 if (!cpumask_intersects(wq_unbound_cpumask, mask)) {

7113 pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",

7114 cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));

7115 return;

7116 }

7117

7118 cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);

7119 }

7120

7121 /**

7122 * workqueue_init_early - early init for workqueue subsystem

7123 *

7124 * This is the first step of three-staged workqueue subsystem initialization and

7125 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are

7126 * up. It sets up all the data structures and system workqueues and allows early

7127 * boot code to create workqueues and queue/cancel work items. Actual work item

7128 * execution starts only after kthreads can be created and scheduled right

7129 * before early initcalls.

7130 */

7131 void __init workqueue_init_early(void)

7132 {

7133 struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];

7134 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };

7135 int i, cpu;

7136

7137 BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

7138

7139 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));

7140 BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));

7141 BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));

7142

7143 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);

7144 restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));

7145 restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));

7146 if (!cpumask_empty(&wq_cmdline_cpumask))

7147 restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);

7148

7149 cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);

7150

7151 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

7152

7153 wq_update_pod_attrs_buf = alloc_workqueue_attrs();

7154 BUG_ON(!wq_update_pod_attrs_buf);

7155

7156 /*

7157 * If nohz_full is enabled, set power efficient workqueue as unbound.

7158 * This allows workqueue items to be moved to HK CPUs.

7159 */

7160 if (housekeeping_enabled(HK_TYPE_TICK))

7161 wq_power_efficient = true;

7162

7163 /* initialize WQ_AFFN_SYSTEM pods */

7164 pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);

7165 pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);

7166 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);

7167 BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);

7168

7169 BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

7170

7171 pt->nr_pods = 1;

7172 cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);

7173 pt->pod_node[0] = NUMA_NO_NODE;

7174 pt->cpu_pod[0] = 0;

7175

7176 /* initialize CPU pools */

7177 for_each_possible_cpu(cpu) {

7178 struct worker_pool *pool;

7179

7180 i = 0;

7181 for_each_cpu_worker_pool(pool, cpu) {

7182 BUG_ON(init_worker_pool(pool));

7183 pool->cpu = cpu;

7184 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));

7185 cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu));

7186 pool->attrs->nice = std_nice[i++];

7187 pool->attrs->affn_strict = true;

7188 pool->node = cpu_to_node(cpu);

7189

7190 /* alloc pool ID */

7191 mutex_lock(&wq_pool_mutex);

7192 BUG_ON(worker_pool_assign_id(pool));

7193 mutex_unlock(&wq_pool_mutex);

7194 }

7195 }

7196

7197 /* create default unbound and ordered wq attrs */

7198 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {

7199 struct workqueue_attrs *attrs;

7200

7201 BUG_ON(!(attrs = alloc_workqueue_attrs()));

7202 attrs->nice = std_nice[i];

7203 unbound_std_wq_attrs[i] = attrs;

7204

7205 /*

7206 * An ordered wq should have only one pwq as ordering is

7207 * guaranteed by max_active which is enforced by pwqs.

7208 */

7209 BUG_ON(!(attrs = alloc_workqueue_attrs()));

7210 attrs->nice = std_nice[i];

7211 attrs->ordered = true;

7212 ordered_wq_attrs[i] = attrs;

7213 }

7214

7215 system_wq = alloc_workqueue("events", 0, 0);

7216 system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);

7217 system_long_wq = alloc_workqueue("events_long", 0, 0);

7218 system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,

7219 WQ_MAX_ACTIVE);

7220 system_freezable_wq = alloc_workqueue("events_freezable",

7221 WQ_FREEZABLE, 0);

7222 system_power_efficient_wq = alloc_workqueue("events_power_efficient",

7223 WQ_POWER_EFFICIENT, 0);

7224 system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",

7225 WQ_FREEZABLE | WQ_POWER_EFFICIENT,

7226 0);

7227 BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||

7228 !system_unbound_wq || !system_freezable_wq ||

7229 !system_power_efficient_wq ||

7230 !system_freezable_power_efficient_wq);

7231 }

7232

7233 static void __init wq_cpu_intensive_thresh_init(void)

7234 {

7235 unsigned long thresh;

7236 unsigned long bogo;

7237

7238 pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");

7239 BUG_ON(IS_ERR(pwq_release_worker));

7240

7241 /* if the user set it to a specific value, keep it */

7242 if (wq_cpu_intensive_thresh_us != ULONG_MAX)

7243 return;

7244

7245 /*

7246 * The default of 10ms is derived from the fact that most modern (as of

7247 * 2023) processors can do a lot in 10ms and that it's just below what

7248 * most consider human-perceivable. However, the kernel also runs on a

7249 * lot slower CPUs including microcontrollers where the threshold is way

7250 * too low.

7251 *

7252 * Let's scale up the threshold upto 1 second if BogoMips is below 4000.

7253 * This is by no means accurate but it doesn't have to be. The mechanism

7254 * is still useful even when the threshold is fully scaled up. Also, as

7255 * the reports would usually be applicable to everyone, some machines

7256 * operating on longer thresholds won't significantly diminish their

7257 * usefulness.

7258 */

7259 thresh = 10 * USEC_PER_MSEC;

7260

7261 /* see init/calibrate.c for lpj -> BogoMIPS calculation */

7262 bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);

7263 if (bogo < 4000)

7264 thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);

7265

7266 pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",

7267 loops_per_jiffy, bogo, thresh);

7268

7269 wq_cpu_intensive_thresh_us = thresh;

7270 }

7271

7272 /**

7273 * workqueue_init - bring workqueue subsystem fully online

7274 *

7275 * This is the second step of three-staged workqueue subsystem initialization

7276 * and invoked as soon as kthreads can be created and scheduled. Workqueues have

7277 * been created and work items queued on them, but there are no kworkers

7278 * executing the work items yet. Populate the worker pools with the initial

7279 * workers and enable future kworker creations.

7280 */

7281 void __init workqueue_init(void)

7282 {

7283 struct workqueue_struct *wq;

7284 struct worker_pool *pool;

7285 int cpu, bkt;

7286

7287 wq_cpu_intensive_thresh_init();

7288

7289 mutex_lock(&wq_pool_mutex);

7290

7291 /*

7292 * Per-cpu pools created earlier could be missing node hint. Fix them

7293 * up. Also, create a rescuer for workqueues that requested it.

7294 */

7295 for_each_possible_cpu(cpu) {

7296 for_each_cpu_worker_pool(pool, cpu) {

7297 pool->node = cpu_to_node(cpu);

7298 }

7299 }

7300

7301 list_for_each_entry(wq, &workqueues, list) {

7302 WARN(init_rescuer(wq),

7303 "workqueue: failed to create early rescuer for %s",

7304 wq->name);

7305 }

7306

7307 mutex_unlock(&wq_pool_mutex);

7308

7309 /* create the initial workers */

7310 for_each_online_cpu(cpu) {

7311 for_each_cpu_worker_pool(pool, cpu) {

7312 pool->flags &= ~POOL_DISASSOCIATED;

7313 BUG_ON(!create_worker(pool));

7314 }

7315 }

7316

7317 hash_for_each(unbound_pool_hash, bkt, pool, hash_node)

7318 BUG_ON(!create_worker(pool));

7319

7320 wq_online = true;

7321 wq_watchdog_init();

7322 }

7323

7324 /*

7325 * Initialize @pt by first initializing @pt->cpu_pod[] with pod IDs according to

7326 * @cpu_shares_pod(). Each subset of CPUs that share a pod is assigned a unique

7327 * and consecutive pod ID. The rest of @pt is initialized accordingly.

7328 */

7329 static void __init init_pod_type(struct wq_pod_type *pt,

7330 bool (*cpus_share_pod)(int, int))

7331 {

7332 int cur, pre, cpu, pod;

7333

7334 pt->nr_pods = 0;

7335

7336 /* init @pt->cpu_pod[] according to @cpus_share_pod() */

7337 pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);

7338 BUG_ON(!pt->cpu_pod);

7339

7340 for_each_possible_cpu(cur) {

7341 for_each_possible_cpu(pre) {

7342 if (pre >= cur) {

7343 pt->cpu_pod[cur] = pt->nr_pods++;

7344 break;

7345 }

7346 if (cpus_share_pod(cur, pre)) {

7347 pt->cpu_pod[cur] = pt->cpu_pod[pre];

7348 break;

7349 }

7350 }

7351 }

7352

7353 /* init the rest to match @pt->cpu_pod[] */

7354 pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);

7355 pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);

7356 BUG_ON(!pt->pod_cpus || !pt->pod_node);

7357

7358 for (pod = 0; pod < pt->nr_pods; pod++)

7359 BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));

7360

7361 for_each_possible_cpu(cpu) {

7362 cpumask_set_cpu(cpu, pt->pod_cpus[pt->cpu_pod[cpu]]);

7363 pt->pod_node[pt->cpu_pod[cpu]] = cpu_to_node(cpu);

7364 }

7365 }

7366

7367 static bool __init cpus_dont_share(int cpu0, int cpu1)

7368 {

7369 return false;

7370 }

7371

7372 static bool __init cpus_share_smt(int cpu0, int cpu1)

7373 {

7374 #ifdef CONFIG_SCHED_SMT

7375 return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));

7376 #else

7377 return false;

7378 #endif

7379 }

7380

7381 static bool __init cpus_share_numa(int cpu0, int cpu1)

7382 {

7383 return cpu_to_node(cpu0) == cpu_to_node(cpu1);

7384 }

7385

7386 /**

7387 * workqueue_init_topology - initialize CPU pods for unbound workqueues

7388 *

7389 * This is the third step of there-staged workqueue subsystem initialization and

7390 * invoked after SMP and topology information are fully initialized. It

7391 * initializes the unbound CPU pods accordingly.

7392 */

7393 void __init workqueue_init_topology(void)

7394 {

7395 struct workqueue_struct *wq;

7396 int cpu;

7397

7398 init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);

7399 init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);

7400 init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);

7401 init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

7402

7403 mutex_lock(&wq_pool_mutex);

7404

7405 /*

7406 * Workqueues allocated earlier would have all CPUs sharing the default

7407 * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU

7408 * combinations to apply per-pod sharing.

7409 */

7410 list_for_each_entry(wq, &workqueues, list) {

7411 for_each_online_cpu(cpu)

7412 wq_update_pod(wq, cpu, cpu, true);

7413 if (wq->flags & WQ_UNBOUND) {

7414 mutex_lock(&wq->mutex);

7415 wq_update_node_max_active(wq, -1);

7416 mutex_unlock(&wq->mutex);

7417 }

7418 }

7419

7420 mutex_unlock(&wq_pool_mutex);

7421 }

7422

/*
 * Exported helper that warns about flushing a system-wide workqueue and
 * dumps the current stack so the offending caller can be identified.
 */
void __warn_flushing_systemwide_wq(void)
{
	pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");

	dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

7429

7430 static int __init workqueue_unbound_cpus_setup(char *str)

7431 {

7432 if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) {

7433 cpumask_clear(&wq_cmdline_cpumask);

7434 pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");

7435 }

7436

7437 return 1;

7438 }

7439 __setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);