/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU expedited grace periods
 *
 * Copyright IBM Corporation, 2016
 *
 * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
 */

#include <linux/lockdep.h>

static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);

/*
 * Record the start of an expedited grace period.
 */
static void rcu_exp_gp_seq_start(void)
{
	rcu_seq_start(&rcu_state.expedited_sequence);
}

/*
 * Return the value that the expedited-grace-period counter will have
 * at the end of the current grace period.
 */
static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
{
	return rcu_seq_endval(&rcu_state.expedited_sequence);
}

/*
 * Record the end of an expedited grace period.
 */
static void rcu_exp_gp_seq_end(void)
{
	rcu_seq_end(&rcu_state.expedited_sequence);
	smp_mb(); /* Ensure that consecutive grace periods serialize. */
}

/*
 * Take a snapshot of the expedited-grace-period counter.
 */
static unsigned long rcu_exp_gp_seq_snap(void)
{
	unsigned long s;

	smp_mb(); /* Caller's modifications seen first by other CPUs. */
	s = rcu_seq_snap(&rcu_state.expedited_sequence);
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("snap"));
	return s;
}

/*
 * Given a counter snapshot from rcu_exp_gp_seq_snap(), return true
 * if a full expedited grace period has elapsed since that snapshot
 * was taken.
 */
static bool rcu_exp_gp_seq_done(unsigned long s)
{
	return rcu_seq_done(&rcu_state.expedited_sequence, s);
}
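
/*
 * Example (illustrative sketch, not part of this file): a minimal
 * userspace model of the rcu_seq counter protocol used above.  The
 * low-order two bits of the counter record whether a grace period is
 * in progress; the bits above them are a generation count.  The real
 * helpers live in kernel/rcu/rcu.h and additionally supply memory
 * barriers, READ_ONCE()/WRITE_ONCE(), and wrap-safe comparisons, all
 * of which are omitted here.
 */
#if 0 /* sketch only */
#define SEQ_STATE_MASK	0x3UL

static unsigned long seq;			/* models expedited_sequence */

static void seq_start(void) { seq++; }		/* GP now in progress */
static void seq_end(void)   { seq = (seq & ~SEQ_STATE_MASK) + 4; }

/* Snapshot: first counter value at which a full GP has elapsed. */
static unsigned long seq_snap(void)
{
	return (seq + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
}

/* Wrap-around handling omitted; the kernel uses ULONG_CMP_GE(). */
static int seq_done(unsigned long s) { return seq >= s; }
#endif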
/*
 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
 * recent CPU-online activity.  Note that these masks are not cleared
 * when CPUs go offline, so they reflect the union of all CPUs that have
 * ever been online.  This means that this function normally takes its
 * no-work-to-do fastpath.
 */
static void sync_exp_reset_tree_hotplug(void)
{
	bool done;
	unsigned long flags;
	unsigned long mask;
	unsigned long oldmask;
	int ncpus = smp_load_acquire(&rcu_state.ncpus); /* Order vs. locking. */
	struct rcu_node *rnp;
	struct rcu_node *rnp_up;

	/* If no new CPUs onlined since last time, nothing to do. */
	if (likely(ncpus == rcu_state.ncpus_snap))
		return;
	rcu_state.ncpus_snap = ncpus;

	/*
	 * Each pass through the following loop propagates newly onlined
	 * CPUs for the current rcu_node structure up the rcu_node tree.
	 */
	rcu_for_each_leaf_node(rnp) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->expmaskinit == rnp->expmaskinitnext) {
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			continue;  /* No new CPUs, nothing to do. */
		}

		/* Update this node's mask, track old value for propagation. */
		oldmask = rnp->expmaskinit;
		rnp->expmaskinit = rnp->expmaskinitnext;
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

		/* If it was already nonzero, nothing to propagate. */
		if (oldmask)
			continue;

		/* Propagate the new CPU up the tree. */
		mask = rnp->grpmask;
		rnp_up = rnp->parent;
		done = false;
		while (rnp_up) {
			raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
			if (rnp_up->expmaskinit)
				done = true;
			rnp_up->expmaskinit |= mask;
			raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
			if (done)
				break;
			mask = rnp_up->grpmask;
			rnp_up = rnp_up->parent;
		}
	}
}

/*
 * Reset the ->expmask values in the rcu_node tree in preparation for
 * a new expedited grace period.
 */
static void __maybe_unused sync_exp_reset_tree(void)
{
	unsigned long flags;
	struct rcu_node *rnp;

	sync_exp_reset_tree_hotplug();
	rcu_for_each_node_breadth_first(rnp) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		WARN_ON_ONCE(rnp->expmask);
		rnp->expmask = rnp->expmaskinit;
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
}

/*
 * Return true if there is no RCU expedited grace period in progress
 * for the specified rcu_node structure, in other words, if all CPUs and
 * tasks covered by the specified rcu_node structure have done their bit
 * for the current expedited grace period.  Works only for preemptible
 * RCU -- other RCU implementations use other means.
 *
 * Caller must hold the specified rcu_node structure's ->lock.
 */
static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
	raw_lockdep_assert_held_rcu_node(rnp);

	return rnp->exp_tasks == NULL &&
	       READ_ONCE(rnp->expmask) == 0;
}

/*
 * Like sync_rcu_preempt_exp_done(), but this function assumes the caller
 * doesn't hold the rcu_node's ->lock, and will acquire and release the
 * lock itself.
 */
static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp)
{
	unsigned long flags;
	bool ret;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	ret = sync_rcu_preempt_exp_done(rnp);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	return ret;
}
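
/*
 * Example (illustrative sketch, not part of this file): the
 * locked-predicate plus _unlocked-wrapper idiom above, modeled in
 * userspace with pthreads.  The struct and names here are invented
 * for the sketch; only the structure mirrors the code above.
 */
#if 0 /* sketch only */
#include <pthread.h>
#include <stdbool.h>

struct node {
	pthread_mutex_t lock;
	unsigned long expmask;	/* nonzero while work is outstanding */
};

/* Caller must hold n->lock, as sync_rcu_preempt_exp_done() requires. */
static bool node_done(struct node *n)
{
	return n->expmask == 0;
}

/* The wrapper acquires the lock itself, mirroring ..._unlocked(). */
static bool node_done_unlocked(struct node *n)
{
	bool ret;

	pthread_mutex_lock(&n->lock);
	ret = node_done(n);
	pthread_mutex_unlock(&n->lock);
	return ret;
}
#endif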
/*
 * Report the exit from RCU read-side critical section for the last task
 * that queued itself during or before the current expedited preemptible-RCU
 * grace period.  This event is reported either to the rcu_node structure on
 * which the task was queued or to one of that rcu_node structure's ancestors,
 * recursively up the tree.  (Calm down, calm down, we do the recursion
 * iteratively!)
 *
 * Caller must hold the specified rcu_node structure's ->lock.
 */
static void __rcu_report_exp_rnp(struct rcu_node *rnp,
				 bool wake, unsigned long flags)
	__releases(rnp->lock)
{
	unsigned long mask;

	for (;;) {
		if (!sync_rcu_preempt_exp_done(rnp)) {
			if (!rnp->expmask)
				rcu_initiate_boost(rnp, flags);
			else
				raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			break;
		}
		if (rnp->parent == NULL) {
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			if (wake) {
				smp_mb(); /* EGP done before wake_up(). */
				swake_up_one(&rcu_state.expedited_wq);
			}
			break;
		}
		mask = rnp->grpmask;
		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
		rnp = rnp->parent;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
		WARN_ON_ONCE(!(rnp->expmask & mask));
		rnp->expmask &= ~mask;
	}
}
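
/*
 * Example (illustrative sketch, not part of this file): the "iterative
 * recursion" above, reduced to its core.  A node reports completion by
 * clearing its bit in its parent; the walk stops at the first level
 * that still has other bits set, because an ancestor cannot be done
 * before all of its children are.  Locking and wakeups are omitted,
 * and the struct and names are invented for this sketch.
 */
#if 0 /* sketch only */
struct tnode {
	struct tnode *parent;
	unsigned long grpmask;	/* this node's bit in parent->mask */
	unsigned long mask;	/* children still being waited on */
};

static void report_up(struct tnode *n)
{
	while (n->mask == 0 && n->parent) {
		n->parent->mask &= ~n->grpmask;
		n = n->parent;
	}
}
#endif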
/*
 * Report expedited quiescent state for specified node.  This is a
 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
 */
static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
{
	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	__rcu_report_exp_rnp(rnp, wake, flags);
}

/*
 * Report expedited quiescent state for multiple CPUs, all covered by the
 * specified leaf rcu_node structure.
 */
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
				    unsigned long mask, bool wake)
{
	unsigned long flags;

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (!(rnp->expmask & mask)) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
	rnp->expmask &= ~mask;
	__rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */
}

/*
 * Report expedited quiescent state for specified rcu_data (CPU).
 */
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
	WRITE_ONCE(rdp->exp_deferred_qs, false);
	rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}

/* Common code for work-done checking. */
static bool sync_exp_work_done(unsigned long s)
{
	if (rcu_exp_gp_seq_done(s)) {
		trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
		smp_mb(); /* Ensure test happens before caller kfree(). */
		return true;
	}
	return false;
}

/*
 * Funnel-lock acquisition for expedited grace periods.  Returns true
 * if some other task completed an expedited grace period that this task
 * can piggy-back on, and with no mutex held.  Otherwise, returns false
 * with the mutex held, indicating that the caller must actually do the
 * expedited grace period.
 */
static bool exp_funnel_lock(unsigned long s)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
	struct rcu_node *rnp = rdp->mynode;
	struct rcu_node *rnp_root = rcu_get_root();

	/* Low-contention fastpath. */
	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
	    (rnp == rnp_root ||
	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
	    mutex_trylock(&rcu_state.exp_mutex))
		goto fastpath;

	/*
	 * Each pass through the following loop works its way up
	 * the rcu_node tree, returning if others have done the work or
	 * otherwise falls through to acquire ->exp_mutex.  The mapping
	 * from CPU to rcu_node structure can be inexact, as it is just
	 * promoting locality and is not strictly needed for correctness.
	 */
	for (; rnp != NULL; rnp = rnp->parent) {
		if (sync_exp_work_done(s))
			return true;

		/* Work not done, either wait here or go up. */
		spin_lock(&rnp->exp_lock);
		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {

			/* Someone else doing GP, so wait for them. */
			spin_unlock(&rnp->exp_lock);
			trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level,
						  rnp->grplo, rnp->grphi,
						  TPS("wait"));
			wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
				   sync_exp_work_done(s));
			return true;
		}
		rnp->exp_seq_rq = s; /* Followers can wait on us. */
		spin_unlock(&rnp->exp_lock);
		trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level,
					  rnp->grplo, rnp->grphi, TPS("nxtlvl"));
	}
	mutex_lock(&rcu_state.exp_mutex);
fastpath:
	if (sync_exp_work_done(s)) {
		mutex_unlock(&rcu_state.exp_mutex);
		return true;
	}
	rcu_exp_gp_seq_start();
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("start"));
	return false;
}
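
/*
 * Example (illustrative sketch, not part of this file): the funnel-lock
 * idea above in miniature.  Each waiter wants some grace period numbered
 * >= s to complete.  It climbs a tree of "requested" counters; if it
 * finds a level already requested at or beyond s, it just waits there
 * and piggybacks on the holder's grace period, so only one task per
 * needed GP reaches the root mutex.  The struct and the gp_done() and
 * wait_for() helpers are invented stand-ins; memory ordering, tracing,
 * and wrap-safe comparisons are omitted.
 */
#if 0 /* sketch only */
struct fnode {
	struct fnode *parent;
	pthread_mutex_t exp_lock;
	unsigned long exp_seq_rq;	/* highest GP number requested here */
};

static pthread_mutex_t exp_mutex;	/* models rcu_state.exp_mutex */

static bool funnel(struct fnode *n, unsigned long s)
{
	for (; n; n = n->parent) {
		if (gp_done(s))
			return true;	/* someone else finished our GP */
		pthread_mutex_lock(&n->exp_lock);
		if (n->exp_seq_rq >= s) {
			pthread_mutex_unlock(&n->exp_lock);
			wait_for(gp_done(s));	/* piggyback on the holder */
			return true;
		}
		n->exp_seq_rq = s;	/* followers can wait on us */
		pthread_mutex_unlock(&n->exp_lock);
	}
	pthread_mutex_lock(&exp_mutex);	/* we must run the GP ourselves */
	return false;
}
#endif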
/*
 * Select the CPUs within the specified rcu_node that the upcoming
 * expedited grace period needs to wait for.
 */
static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
	int cpu;
	unsigned long flags;
	unsigned long mask_ofl_test;
	unsigned long mask_ofl_ipi;
	int ret;
	struct rcu_exp_work *rewp =
		container_of(wp, struct rcu_exp_work, rew_work);
	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);

	raw_spin_lock_irqsave_rcu_node(rnp, flags);

	/* Each pass checks a CPU for identity, offline, and idle. */
	mask_ofl_test = 0;
	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
		int snap;

		if (raw_smp_processor_id() == cpu ||
		    !(rnp->qsmaskinitnext & mask)) {
			mask_ofl_test |= mask;
		} else {
			snap = rcu_dynticks_snap(rdp);
			if (rcu_dynticks_in_eqs(snap))
				mask_ofl_test |= mask;
			else
				rdp->exp_dynticks_snap = snap;
		}
	}
	mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;

	/*
	 * Need to wait for any blocked tasks as well.  Note that
	 * additional blocking tasks will also block the expedited GP
	 * until such time as the ->expmask bits are cleared.
	 */
	if (rcu_preempt_has_tasks(rnp))
		rnp->exp_tasks = rnp->blkd_tasks.next;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	/* IPI the remaining CPUs for expedited quiescent state. */
	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

		if (!(mask_ofl_ipi & mask))
			continue;
retry_ipi:
		if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
			mask_ofl_test |= mask;
			continue;
		}
		if (get_cpu() == cpu) {
			put_cpu();
			continue;
		}
		ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
		put_cpu();
		if (!ret) {
			mask_ofl_ipi &= ~mask;
			continue;
		}
		/* Failed, raced with CPU hotplug operation. */
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if ((rnp->qsmaskinitnext & mask) &&
		    (rnp->expmask & mask)) {
			/* Online, so delay for a bit and try again. */
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
			trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
			schedule_timeout_uninterruptible(1);
			goto retry_ipi;
		}
		/* CPU really is offline, so we can ignore it. */
		if (!(rnp->expmask & mask))
			mask_ofl_ipi &= ~mask;
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
	/* Report quiescent states for those that went offline. */
	mask_ofl_test |= mask_ofl_ipi;
	if (mask_ofl_test)
		rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}

/*
 * Select the nodes that the upcoming expedited grace period needs
 * to wait for.
 */
static void sync_rcu_exp_select_cpus(void)
{
	int cpu;
	struct rcu_node *rnp;

	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
	sync_exp_reset_tree();
	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("select"));

	/* Schedule work for each leaf rcu_node structure. */
	rcu_for_each_leaf_node(rnp) {
		rnp->exp_need_flush = false;
		if (!READ_ONCE(rnp->expmask))
			continue; /* Avoid early boot non-existent wq. */
		if (!READ_ONCE(rcu_par_gp_wq) ||
		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
		    rcu_is_last_leaf_node(rnp)) {
			/* No workqueues yet or last leaf, do direct call. */
			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
			continue;
		}
		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
		cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
		/* If all offline, queue the work on an unbound CPU. */
		if (unlikely(cpu > rnp->grphi - rnp->grplo))
			cpu = WORK_CPU_UNBOUND;
		else
			cpu += rnp->grplo;
		queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
		rnp->exp_need_flush = true;
	}

	/* Wait for workqueue jobs (if any) to complete. */
	rcu_for_each_leaf_node(rnp)
		if (rnp->exp_need_flush)
			flush_work(&rnp->rew.rew_work);
}
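
/*
 * Example (illustrative sketch, not part of this file): the
 * fan-out/flush pattern used above, shown with the generic workqueue
 * API.  One work item is queued per unit of work, a per-item flag
 * records what was actually queued, and a second pass flushes only
 * those items.  The struct and function names are invented for this
 * sketch; the CPU-selection and direct-call special cases above are
 * deliberately omitted.
 */
#if 0 /* sketch only */
struct unit {
	struct work_struct work;
	bool need_flush;
};

static void scan_units(struct unit *units, int n,
		       struct workqueue_struct *wq, work_func_t fn)
{
	int i;

	/* Fan out: one work item per unit. */
	for (i = 0; i < n; i++) {
		units[i].need_flush = false;
		INIT_WORK(&units[i].work, fn);
		queue_work(wq, &units[i].work);
		units[i].need_flush = true;
	}

	/* Wait for everything that was queued. */
	for (i = 0; i < n; i++)
		if (units[i].need_flush)
			flush_work(&units[i].work);
}
#endif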
static void synchronize_sched_expedited_wait(void)
{
	int cpu;
	unsigned long jiffies_stall;
	unsigned long jiffies_start;
	unsigned long mask;
	int ndetected;
	struct rcu_node *rnp;
	struct rcu_node *rnp_root = rcu_get_root();
	int ret;

	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
	jiffies_stall = rcu_jiffies_till_stall_check();
	jiffies_start = jiffies;

	for (;;) {
		ret = swait_event_timeout_exclusive(
				rcu_state.expedited_wq,
				sync_rcu_preempt_exp_done_unlocked(rnp_root),
				jiffies_stall);
		if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root))
			return;
		WARN_ON(ret < 0); /* workqueues should not be signaled. */
		if (rcu_cpu_stall_suppress)
			continue;
		panic_on_rcu_stall();
		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
		       rcu_state.name);
		ndetected = 0;
		rcu_for_each_leaf_node(rnp) {
			ndetected += rcu_print_task_exp_stall(rnp);
			for_each_leaf_node_possible_cpu(rnp, cpu) {
				struct rcu_data *rdp;

				mask = leaf_node_cpu_bit(rnp, cpu);
				if (!(rnp->expmask & mask))
					continue;
				ndetected++;
				rdp = per_cpu_ptr(&rcu_data, cpu);
				pr_cont(" %d-%c%c%c", cpu,
					"O."[!!cpu_online(cpu)],
					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
			}
		}
		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
			jiffies - jiffies_start, rcu_state.expedited_sequence,
			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
		if (ndetected) {
			pr_err("blocking rcu_node structures:");
			rcu_for_each_node_breadth_first(rnp) {
				if (rnp == rnp_root)
					continue; /* printed unconditionally */
				if (sync_rcu_preempt_exp_done_unlocked(rnp))
					continue;
				pr_cont(" l=%u:%d-%d:%#lx/%c",
					rnp->level, rnp->grplo, rnp->grphi,
					rnp->expmask,
					".T"[!!rnp->exp_tasks]);
			}
			pr_cont("\n");
		}
		rcu_for_each_leaf_node(rnp) {
			for_each_leaf_node_possible_cpu(rnp, cpu) {
				mask = leaf_node_cpu_bit(rnp, cpu);
				if (!(rnp->expmask & mask))
					continue;
				dump_cpu_task(cpu);
			}
		}
		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
	}
}

/*
 * Wait for the current expedited grace period to complete, and then
 * wake up everyone who piggybacked on the just-completed expedited
 * grace period.  Also update all the ->exp_seq_rq counters as needed
 * in order to avoid counter-wrap problems.
 */
static void rcu_exp_wait_wake(unsigned long s)
{
	struct rcu_node *rnp;

	synchronize_sched_expedited_wait();
	rcu_exp_gp_seq_end();
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end"));

	/*
	 * Switch over to wakeup mode, allowing the next GP, but -only- the
	 * next GP, to proceed.
	 */
	mutex_lock(&rcu_state.exp_wake_mutex);

	rcu_for_each_node_breadth_first(rnp) {
		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
			spin_lock(&rnp->exp_lock);
			/* Recheck, avoid hang in case someone just arrived. */
			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
				rnp->exp_seq_rq = s;
			spin_unlock(&rnp->exp_lock);
		}
		smp_mb(); /* All above changes before wakeup. */
		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]);
	}
	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
	mutex_unlock(&rcu_state.exp_wake_mutex);
}
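
/*
 * Example (illustrative sketch, not part of this file): why the
 * ->exp_wq[] array indexed by rcu_seq_ctr(s) & 0x3 works.  Waiters for
 * different grace periods hash to different wait queues, so a wakeup
 * for one GP cannot strand or prematurely wake waiters for an adjacent
 * one.  The index arithmetic below assumes the rcu_seq layout used in
 * kernel/rcu/rcu.h (two low-order state bits, counter above them).
 */
#if 0 /* sketch only */
#define RCU_SEQ_CTR_SHIFT	2
#define rcu_seq_ctr(s)		((s) >> RCU_SEQ_CTR_SHIFT)

/*
 * A GP ending at s = 4 uses queue 1, s = 8 queue 2, s = 12 queue 3,
 * s = 16 queue 0, and so on: consecutive GPs never share a queue.
 */
static int exp_wq_index(unsigned long s)
{
	return rcu_seq_ctr(s) & 0x3;
}
#endif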
/*
 * Common code to drive an expedited grace period forward, used by
 * workqueues and mid-boot-time tasks.
 */
static void rcu_exp_sel_wait_wake(unsigned long s)
{
	/* Initialize the rcu_node tree in preparation for the wait. */
	sync_rcu_exp_select_cpus();

	/* Wait and clean up, including waking everyone. */
	rcu_exp_wait_wake(s);
}

/*
 * Work-queue handler to drive an expedited grace period forward.
 */
static void wait_rcu_exp_gp(struct work_struct *wp)
{
	struct rcu_exp_work *rewp;

	rewp = container_of(wp, struct rcu_exp_work, rew_work);
	rcu_exp_sel_wait_wake(rewp->rew_s);
}

#ifdef CONFIG_PREEMPT_RCU

/*
 * Remote handler for smp_call_function_single().  If there is an
 * RCU read-side critical section in effect, request that the
 * next rcu_read_unlock() record the quiescent state up the
 * ->expmask fields in the rcu_node tree.  Otherwise, immediately
 * report the quiescent state.
 */
static void rcu_exp_handler(void *unused)
{
	unsigned long flags;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp = rdp->mynode;
	struct task_struct *t = current;

	/*
	 * First, the common case of not being in an RCU read-side
	 * critical section.  If also enabled or idle, immediately
	 * report the quiescent state, otherwise defer.
	 */
	if (!t->rcu_read_lock_nesting) {
		if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
		    rcu_dynticks_curr_cpu_in_eqs()) {
			rcu_report_exp_rdp(rdp);
		} else {
			rdp->exp_deferred_qs = true;
			set_tsk_need_resched(t);
			set_preempt_need_resched();
		}
		return;
	}

	/*
	 * Second, the less-common case of being in an RCU read-side
	 * critical section.  In this case we can count on a future
	 * rcu_read_unlock().  However, this rcu_read_unlock() might
	 * execute on some other CPU, but in that case there will be
	 * a future context switch.  Either way, if the expedited
	 * grace period is still waiting on this CPU, set ->deferred_qs
	 * so that the eventual quiescent state will be reported.
	 * Note that there is a large group of race conditions that
	 * can have caused this quiescent state to already have been
	 * reported, so we really do need to check ->expmask.
	 */
	if (t->rcu_read_lock_nesting > 0) {
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
		if (rnp->expmask & rdp->grpmask) {
			rdp->exp_deferred_qs = true;
			t->rcu_read_unlock_special.b.exp_hint = true;
		}
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}

	/*
	 * The final and least likely case is where the interrupted
	 * code was just about to or just finished exiting the RCU-preempt
	 * read-side critical section, and no, we can't tell which.
	 * So either way, set ->deferred_qs to flag later code that
	 * a quiescent state is required.
	 *
	 * If the CPU is fully enabled (or if some buggy RCU-preempt
	 * read-side critical section is being used from idle), just
	 * invoke rcu_preempt_deferred_qs() to immediately report the
	 * quiescent state.  We cannot use rcu_read_unlock_special()
	 * because we are in an interrupt handler, which will cause that
	 * function to take an early exit without doing anything.
	 *
	 * Otherwise, force a context switch after the CPU enables everything.
	 */
	rdp->exp_deferred_qs = true;
	if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
	    WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) {
		rcu_preempt_deferred_qs(t);
	} else {
		set_tsk_need_resched(t);
		set_preempt_need_resched();
	}
}

/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */
static void sync_sched_exp_online_cleanup(int cpu)
{
}

/*
 * Scan the current list of tasks blocked within RCU read-side critical
 * sections, printing out the tid of each that is blocking the current
 * expedited grace period.
 */
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
	struct task_struct *t;
	int ndetected = 0;

	if (!rnp->exp_tasks)
		return 0;
	t = list_entry(rnp->exp_tasks->prev,
		       struct task_struct, rcu_node_entry);
	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
		pr_cont(" P%d", t->pid);
		ndetected++;
	}
	return ndetected;
}

#else /* #ifdef CONFIG_PREEMPT_RCU */

/* Request an expedited quiescent state. */
static void rcu_exp_need_qs(void)
{
	__this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
	/* Store .exp before .rcu_urgent_qs. */
	smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
	set_tsk_need_resched(current);
	set_preempt_need_resched();
}

/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void rcu_exp_handler(void *unused)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	rdp = this_cpu_ptr(&rcu_data);
	rnp = rdp->mynode;
	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
	    __this_cpu_read(rcu_data.cpu_no_qs.b.exp))
		return;
	if (rcu_is_cpu_rrupt_from_idle()) {
		rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
		return;
	}
	rcu_exp_need_qs();
}

/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
	unsigned long flags;
	int my_cpu;
	struct rcu_data *rdp;
	int ret;
	struct rcu_node *rnp;

	rdp = per_cpu_ptr(&rcu_data, cpu);
	rnp = rdp->mynode;
	my_cpu = get_cpu();
	/* Quiescent state either not needed or already requested, leave. */
	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
	    __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
		put_cpu();
		return;
	}
	/* Quiescent state needed on current CPU, so set it up locally. */
	if (my_cpu == cpu) {
		local_irq_save(flags);
		rcu_exp_need_qs();
		local_irq_restore(flags);
		put_cpu();
		return;
	}
	/* Quiescent state needed on some other CPU, send IPI. */
	ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
	put_cpu();
	WARN_ON_ONCE(ret);
}

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections that are
 * blocking the current expedited grace period.
 */
static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
	return 0;
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/**
 * synchronize_rcu_expedited - Brute-force RCU grace period
 *
 * Wait for an RCU grace period, but expedite it.  The basic idea is to
 * IPI all non-idle non-nohz online CPUs.  The IPI handler checks whether
 * the CPU is in an RCU critical section, and if so, it sets a flag that
 * causes the outermost rcu_read_unlock() to report the quiescent state
 * for RCU-preempt or asks the scheduler for help for RCU-sched.  On the
 * other hand, if the CPU is not in an RCU read-side critical section,
 * the IPI handler reports the quiescent state immediately.
 *
 * Although this is a great improvement over previous expedited
 * implementations, it is still unfriendly to real-time workloads, so is
 * thus not recommended for any sort of common-case code.  In fact, if
 * you are using synchronize_rcu_expedited() in a loop, please restructure
 * your code to batch your updates, and then use a single synchronize_rcu()
 * instead.
 *
 * This has the same semantics as (but is more brutal than) synchronize_rcu().
 */
void synchronize_rcu_expedited(void)
{
	struct rcu_exp_work rew;
	struct rcu_node *rnp;
	unsigned long s;

	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal synchronize_rcu_expedited() in RCU read-side critical section");

	/* Is the state such that the call is a grace period? */
	if (rcu_blocking_is_gp())
		return;

	/* If expedited grace periods are prohibited, fall back to normal. */
	if (rcu_gp_is_normal()) {
		wait_rcu_gp(call_rcu);
		return;
	}

	/* Take a snapshot of the sequence number. */
	s = rcu_exp_gp_seq_snap();
	if (exp_funnel_lock(s))
		return;  /* Someone else did our work for us. */

	/* Ensure that load happens before action based on it. */
	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
		/* Direct call during scheduler init and early_initcalls(). */
		rcu_exp_sel_wait_wake(s);
	} else {
		/* Marshall arguments & schedule the expedited grace period. */
		rew.rew_s = s;
		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
		queue_work(rcu_gp_wq, &rew.rew_work);
	}

	/* Wait for expedited grace period to complete. */
	rnp = rcu_get_root();
	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
		   sync_exp_work_done(s));
	smp_mb(); /* Workqueue actions happen before return. */

	/* Let the next expedited grace period start. */
	mutex_unlock(&rcu_state.exp_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
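
/*
 * Example (illustrative sketch, not part of this file): typical
 * updater-side use of synchronize_rcu_expedited().  The struct,
 * variables, and locking below are invented for the sketch; only the
 * synchronize_rcu_expedited() call itself is the API documented above.
 */
#if 0 /* sketch only */
struct conf {
	int threshold;
};

static struct conf __rcu *cur_conf;
static DEFINE_SPINLOCK(conf_lock);

/* Readers do: rcu_read_lock(); c = rcu_dereference(cur_conf); ... */

static void conf_replace(struct conf *newc)
{
	struct conf *oldc;

	spin_lock(&conf_lock);
	oldc = rcu_dereference_protected(cur_conf,
					 lockdep_is_held(&conf_lock));
	rcu_assign_pointer(cur_conf, newc);
	spin_unlock(&conf_lock);

	synchronize_rcu_expedited();	/* wait out all pre-existing readers */
	kfree(oldc);			/* now safe to free the old version */
}
#endif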