// SPDX-License-Identifier: GPL-2.0-only
/*
 * Context tracking: Probe on high level context boundaries such as kernel,
 * userspace, guest or idle.
 *
 * This is used by RCU to remove its dependency on the timer tick while a CPU
 * runs in idle, userspace or guest mode.
 *
 * User/guest tracking started by Frederic Weisbecker:
 *
 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker
 *
 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
 *
 * RCU extended quiescent state bits imported from kernel/rcu/tree.c
 * where the relevant authorship may be found.
 */

#include <linux/context_tracking.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <trace/events/rcu.h>


DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
	.dynticks_nesting = 1,
	.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
#endif
	.state = ATOMIC_INIT(RCU_DYNTICKS_IDX),
};
EXPORT_SYMBOL_GPL(context_tracking);

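/*
 * Informal sketch of the ->state layout, assuming the definitions in
 * <linux/context_tracking_state.h>: the low-order bits hold the current
 * CONTEXT_* value (kernel, idle, user or guest) and the bits from
 * RCU_DYNTICKS_IDX upward form the RCU dynticks counter, e.g.:
 *
 *	state    = atomic_read(&ct->state);
 *	context  = state & (RCU_DYNTICKS_IDX - 1);	// CONTEXT_* value
 *	watching = state & RCU_DYNTICKS_IDX;		// RCU is watching if set
 *
 * Each EQS entry or exit adds RCU_DYNTICKS_IDX plus a context offset, which
 * is what the CONFIG_RCU_EQS_DEBUG warnings below check.
 */
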
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
#define TPS(x)	tracepoint_string(x)

/* Record the current task on dyntick-idle entry. */
static __always_inline void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Record no current task on dyntick-idle exit. */
static __always_inline void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static __always_inline void rcu_dynticks_task_trace_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = true;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static __always_inline void rcu_dynticks_task_trace_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = false;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/*
 * Record entry into an extended quiescent state. This is only to be
 * called when not already in an extended quiescent state, that is,
 * RCU is watching prior to the call to this function and is no longer
 * watching upon return.
 */
static noinstr void ct_kernel_exit_state(int offset)
{
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior RCU read-side
	 * critical sections, and we also must force ordering with the
	 * next idle sojourn.
	 */
	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
	seq = ct_state_inc(offset);
	// RCU is no longer watching. Better be in extended quiescent state!
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
}

/*
 * Record exit from an extended quiescent state. This is only to be
 * called from an extended quiescent state, that is, RCU is not watching
 * prior to the call to this function and is watching upon return.
 */
static noinstr void ct_kernel_enter_state(int offset)
{
	int seq;

	/*
	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
	 * and we also must force ordering with the next RCU read-side
	 * critical section.
	 */
	seq = ct_state_inc(offset);
	// RCU is now watching. Better not be in an extended quiescent state!
	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
}

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
static void noinstr ct_kernel_exit(bool user, int offset)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE);
	WRITE_ONCE(ct->dynticks_nmi_nesting, 0);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     ct_dynticks_nesting() == 0);
	if (ct_dynticks_nesting() != 1) {
		// RCU will still be watching, so just do accounting and leave.
		ct->dynticks_nesting--;
		return;
	}

	instrumentation_begin();
	lockdep_assert_irqs_disabled();
	trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks());
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	rcu_preempt_deferred_qs(current);

	// instrumentation for the noinstr ct_kernel_exit_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));

	instrumentation_end();
	WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */
	// RCU is watching here ...
	ct_kernel_exit_state(offset);
	// ... but is no longer watching here.
	rcu_dynticks_task_enter();
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 *
 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
 * allow for the possibility of usermode upcalls messing up our count of
 * interrupt nesting level during the busy period that is just now starting.
 */
static void noinstr ct_kernel_enter(bool user, int offset)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
	long oldval;

	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
	oldval = ct_dynticks_nesting();
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
	if (oldval) {
		// RCU was already watching, so just do accounting and leave.
		ct->dynticks_nesting++;
		return;
	}
	rcu_dynticks_task_exit();
	// RCU is not watching here ...
	ct_kernel_enter_state(offset);
	// ... but is watching here.
	instrumentation_begin();

	// instrumentation for the noinstr ct_kernel_enter_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));

	trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks());
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
	WRITE_ONCE(ct->dynticks_nesting, 1);
	WARN_ON_ONCE(ct_dynticks_nmi_nesting());
	WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
	instrumentation_end();
}

/**
 * ct_nmi_exit - inform RCU of exit from NMI context
 *
 * If we are returning from the outermost NMI handler that interrupted an
 * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting
 * to let the RCU grace-period handling know that the CPU is back to
 * being RCU-idle.
 *
 * If you add or remove a call to ct_nmi_exit(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_exit(void)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	instrumentation_begin();
	/*
	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
	 * (We are exiting an NMI handler, so RCU better be paying attention
	 * to us!)
	 */
	WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0);
	WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());

	/*
	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
	 * leave it in non-RCU-idle state.
	 */
	if (ct_dynticks_nmi_nesting() != 1) {
		trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2,
				  ct_dynticks());
		WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */
			   ct_dynticks_nmi_nesting() - 2);
		instrumentation_end();
		return;
	}

	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
	trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks());
	WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */

	// instrumentation for the noinstr ct_kernel_exit_state()
	instrument_atomic_write(&ct->state, sizeof(ct->state));
	instrumentation_end();

	// RCU is watching here ...
	ct_kernel_exit_state(RCU_DYNTICKS_IDX);
	// ... but is no longer watching here.

	if (!in_nmi())
		rcu_dynticks_task_enter();
}

/**
 * ct_nmi_enter - inform RCU of entry to NMI context
 *
 * If the CPU was idle from RCU's viewpoint, update ct->state and
 * ct->dynticks_nmi_nesting to let the RCU grace-period handling know
 * that the CPU is active. This implementation permits nested NMIs, as
 * long as the nesting level does not overflow an int. (You will probably
 * run out of stack space first.)
 *
 * If you add or remove a call to ct_nmi_enter(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_nmi_enter(void)
{
	long incby = 2;
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	/* Complain about underflow. */
	WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0);

	/*
	 * If idle from RCU viewpoint, atomically increment ->dynticks
	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
	 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
	 * to be in the outermost NMI handler that interrupted an RCU-idle
	 * period (observation due to Andy Lutomirski).
	 */
	if (rcu_dynticks_curr_cpu_in_eqs()) {

		if (!in_nmi())
			rcu_dynticks_task_exit();

		// RCU is not watching here ...
		ct_kernel_enter_state(RCU_DYNTICKS_IDX);
		// ... but is watching here.

		instrumentation_begin();
		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
		instrument_atomic_read(&ct->state, sizeof(ct->state));
		// instrumentation for the noinstr ct_kernel_enter_state()
		instrument_atomic_write(&ct->state, sizeof(ct->state));

		incby = 1;
	} else if (!in_nmi()) {
		instrumentation_begin();
		rcu_irq_enter_check_tick();
	} else {
		instrumentation_begin();
	}

	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
			  ct_dynticks_nmi_nesting(),
			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
	instrumentation_end();
	WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */
		   ct_dynticks_nmi_nesting() + incby);
	barrier();
}

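/*
 * Illustrative pairing only, nothing below is new API: NMI entry code
 * (for example the generic entry irqentry_nmi_enter()/irqentry_nmi_exit()
 * helpers, or an architecture's own NMI glue) is expected to bracket the
 * handler roughly like this (handle_the_nmi() is a hypothetical stand-in):
 *
 *	ct_nmi_enter();		// RCU starts watching if this CPU was in EQS
 *	handle_the_nmi(regs);	// the actual NMI work
 *	ct_nmi_exit();		// outermost exit may return the CPU to EQS
 */
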
/**
 * ct_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur. (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * If you add or remove a call to ct_idle_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_enter(void)
{
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled());
	ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
}
EXPORT_SYMBOL_GPL(ct_idle_enter);

/**
 * ct_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * If you add or remove a call to ct_idle_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
void noinstr ct_idle_exit(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);
	ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
	raw_local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ct_idle_exit);

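/*
 * Illustrative usage only, nothing below is new API: the idle loop is
 * expected to wrap the low-power wait in these calls with interrupts
 * disabled, in the spirit of (arch_cpu_idle() stands in for whatever the
 * platform actually does):
 *
 *	ct_idle_enter();	// RCU stops watching this CPU
 *	arch_cpu_idle();	// wait for the next interrupt
 *	ct_idle_exit();		// RCU is watching again
 *
 * Recent kernels typically funnel this through the ct_cpuidle_enter()/
 * ct_cpuidle_exit() wrappers rather than calling these directly.
 */
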
/**
 * ct_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur. The caller must have disabled interrupts.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to user mode!
 * This code assumes that the idle loop never does upcalls to user mode.
 * If your architecture's idle loop does do upcalls to user mode (or does
 * anything else that results in unbalanced calls to the irq_enter() and
 * irq_exit() functions), RCU will give you what you deserve, good and hard.
 * But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_enter(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_enter(void)
{
	lockdep_assert_irqs_disabled();
	ct_nmi_enter();
}

/**
 * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur. The caller must have disabled interrupts.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit(). If your
 * architecture's idle loop violates this assumption, RCU will give you what
 * you deserve, good and hard. But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 *
 * If you add or remove a call to ct_irq_exit(), be sure to test with
 * CONFIG_RCU_EQS_DEBUG=y.
 */
noinstr void ct_irq_exit(void)
{
	lockdep_assert_irqs_disabled();
	ct_nmi_exit();
}

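/*
 * Illustrative pairing only, nothing below is new API: an architecture
 * whose interrupt entry does not use the generic entry layer is expected
 * to bracket interrupt handling roughly like this, with interrupts already
 * disabled (arch_do_irq() is a hypothetical stand-in):
 *
 *	ct_irq_enter();		// may pull the CPU out of RCU-idle
 *	arch_do_irq(regs);	// run the actual handlers
 *	ct_irq_exit();		// may return the CPU to RCU-idle
 *
 * Architectures using the generic entry code typically get this for free
 * via the irqentry_enter()/irqentry_exit() paths.
 */
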
/*
 * Wrapper for ct_irq_enter() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_enter_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_enter_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	ct_irq_enter();
	local_irq_restore(flags);
}

/*
 * Wrapper for ct_irq_exit() where interrupts are enabled.
 *
 * If you add or remove a call to ct_irq_exit_irqson(), be sure to test
 * with CONFIG_RCU_EQS_DEBUG=y.
 */
void ct_irq_exit_irqson(void)
{
	unsigned long flags;

	local_irq_save(flags);
	ct_irq_exit();
	local_irq_restore(flags);
}
#else
static __always_inline void ct_kernel_exit(bool user, int offset) { }
static __always_inline void ct_kernel_enter(bool user, int offset) { }
#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */

#ifdef CONFIG_CONTEXT_TRACKING_USER

#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>

DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);

static noinstr bool context_tracking_recursion_enter(void)
{
	int recursion;

	recursion = __this_cpu_inc_return(context_tracking.recursion);
	if (recursion == 1)
		return true;

	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
	__this_cpu_dec(context_tracking.recursion);

	return false;
}

static __always_inline void context_tracking_recursion_exit(void)
{
	__this_cpu_dec(context_tracking.recursion);
}

/**
 * __ct_user_enter - Inform the context tracking that the CPU is going
 *		     to enter user or guest space mode.
 *
 * This function must be called right before we switch from the kernel
 * to user or guest space, when it is guaranteed that the remaining kernel
 * instructions to execute won't use any RCU read side critical section
 * because this function sets RCU in extended quiescent state.
 */
void noinstr __ct_user_enter(enum ctx_state state)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
	lockdep_assert_irqs_disabled();

	/* Kernel threads aren't supposed to go to userspace */
	WARN_ON_ONCE(!current->mm);

	if (!context_tracking_recursion_enter())
		return;

	if (__ct_state() != state) {
		if (ct->active) {
			/*
			 * At this stage, only low level arch entry code remains and
			 * then we'll run in userspace. We can assume there won't be
			 * any RCU read-side critical section until the next call to
			 * user_exit() or ct_irq_enter(). Let's remove RCU's dependency
			 * on the tick.
			 */
			if (state == CONTEXT_USER) {
				instrumentation_begin();
				trace_user_enter(0);
				vtime_user_enter(current);
				instrumentation_end();
			}
			/*
			 * Other than generic entry implementation, we may be past the last
			 * rescheduling opportunity in the entry code. Trigger a self IPI
			 * that will fire and reschedule once we resume in user/guest mode.
			 */
			rcu_irq_work_resched();

			/*
			 * Enter RCU idle mode right before resuming userspace. No use of RCU
			 * is permitted between this call and ct_kernel_enter(). This way the
			 * CPU doesn't need to maintain the tick for RCU maintenance purposes
			 * when the CPU runs in userspace.
			 */
			ct_kernel_exit(true, RCU_DYNTICKS_IDX + state);

			/*
			 * Special case if we only track user <-> kernel transitions for tickless
			 * cputime accounting but we don't support RCU extended quiescent state.
			 * In this case we don't care about any concurrency/ordering.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
				raw_atomic_set(&ct->state, state);
		} else {
			/*
			 * Even if context tracking is disabled on this CPU, because it's outside
			 * the full dynticks mask for example, we still have to keep track of the
			 * context transitions and states to prevent inconsistency on those of
			 * other CPUs.
			 * If a task triggers an exception in userspace, sleeps on the exception
			 * handler and then migrates to another CPU, that new CPU must know where
			 * the exception returns by the time we call exception_exit().
			 * This information can only be provided by the previous CPU when it called
			 * exception_enter().
			 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
			 * is false because we know that CPU is not tickless.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
				/* Tracking for vtime only, no concurrent RCU EQS accounting */
				raw_atomic_set(&ct->state, state);
			} else {
				/*
				 * Tracking for vtime and RCU EQS. Make sure we don't race
				 * with NMIs. OTOH we don't care about ordering here since
				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
				 * ordered.
				 */
				raw_atomic_add(state, &ct->state);
			}
		}
	}
	context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_enter);

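/*
 * Illustrative callers only, nothing below is defined in this file: the
 * generic entry code typically reaches __ct_user_enter()/__ct_user_exit()
 * through user_enter_irqoff()/user_exit_irqoff() (CONTEXT_USER) around the
 * switch to and from userspace, and KVM through
 * context_tracking_guest_enter()/exit() (CONTEXT_GUEST) around running the
 * vCPU. Roughly (exit_to_user_asm() is a hypothetical stand-in):
 *
 *	user_enter_irqoff();	// __ct_user_enter(CONTEXT_USER), IRQs off
 *	exit_to_user_asm(regs);	// low level return to userspace
 *	...
 *	user_exit_irqoff();	// __ct_user_exit(CONTEXT_USER) on kernel entry
 */
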
/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_restore() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __ct_user_enter() through user_enter_irqoff()
 * or context_tracking_guest_enter(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_enter(enum ctx_state state)
{
	unsigned long flags;

	/*
	 * Some contexts may involve an exception occurring in an irq,
	 * leading to that nesting:
	 * ct_irq_enter() ct_kernel_enter(true) ct_kernel_exit(true) ct_irq_exit()
	 * This would mess up the ->dynticks_nesting count though. And rcu_irq_*()
	 * helpers are enough to protect RCU uses inside the exception. So
	 * just return immediately if we detect we are in an IRQ.
	 */
	if (in_interrupt())
		return;

	local_irq_save(flags);
	__ct_user_enter(state);
	local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_enter);
EXPORT_SYMBOL_GPL(ct_user_enter);

/**
 * user_enter_callable() - Unfortunate ASM callable version of user_enter() for
 *			   archs that didn't manage to check the context tracking
 *			   static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls
 * local_irq_restore(), involving illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call user_enter_irqoff(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void user_enter_callable(void)
{
	user_enter();
}
NOKPROBE_SYMBOL(user_enter_callable);

/**
 * __ct_user_exit - Inform the context tracking that the CPU is
 *		    exiting user or guest mode and entering the kernel.
 *
 * This function must be called after we entered the kernel from user or
 * guest space, before any use of an RCU read side critical section. This
 * potentially includes any high level kernel code like syscalls, exceptions,
 * signal handling, etc...
 *
 * This call supports re-entrancy. This way it can be called from any exception
 * handler without needing to know if we came from userspace or not.
 */
void noinstr __ct_user_exit(enum ctx_state state)
{
	struct context_tracking *ct = this_cpu_ptr(&context_tracking);

	if (!context_tracking_recursion_enter())
		return;

	if (__ct_state() == state) {
		if (ct->active) {
			/*
			 * Exit RCU idle mode while entering the kernel because it can
			 * run an RCU read side critical section anytime.
			 */
			ct_kernel_enter(true, RCU_DYNTICKS_IDX - state);
			if (state == CONTEXT_USER) {
				instrumentation_begin();
				vtime_user_exit(current);
				trace_user_exit(0);
				instrumentation_end();
			}

			/*
			 * Special case if we only track user <-> kernel transitions for tickless
			 * cputime accounting but we don't support RCU extended quiescent state.
			 * In this case we don't care about any concurrency/ordering.
			 */
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE))
				raw_atomic_set(&ct->state, CONTEXT_KERNEL);

		} else {
			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {
				/* Tracking for vtime only, no concurrent RCU EQS accounting */
				raw_atomic_set(&ct->state, CONTEXT_KERNEL);
			} else {
				/*
				 * Tracking for vtime and RCU EQS. Make sure we don't race
				 * with NMIs. OTOH we don't care about ordering here since
				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully
				 * ordered.
				 */
				raw_atomic_sub(state, &ct->state);
			}
		}
	}
	context_tracking_recursion_exit();
}
EXPORT_SYMBOL_GPL(__ct_user_exit);

/*
 * OBSOLETE:
 * This function should be noinstr but the below local_irq_save() is
 * unsafe because it involves illegal RCU uses through tracing and lockdep.
 * This is unlikely to be fixed as this function is obsolete. The preferred
 * way is to call __ct_user_exit() through user_exit_irqoff()
 * or context_tracking_guest_exit(). It should be the arch entry code's
 * responsibility to call into context tracking with IRQs disabled.
 */
void ct_user_exit(enum ctx_state state)
{
	unsigned long flags;

	if (in_interrupt())
		return;

	local_irq_save(flags);
	__ct_user_exit(state);
	local_irq_restore(flags);
}
NOKPROBE_SYMBOL(ct_user_exit);
EXPORT_SYMBOL_GPL(ct_user_exit);

/**
 * user_exit_callable() - Unfortunate ASM callable version of user_exit() for
 *			  archs that didn't manage to check the context tracking
 *			  static key from low level code.
 *
 * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(),
 * involving illegal RCU uses through tracing and lockdep. This is unlikely
 * to be fixed as this function is obsolete. The preferred way is to call
 * user_exit_irqoff(). It should be the arch entry code's responsibility to
 * call into context tracking with IRQs disabled.
 */
void user_exit_callable(void)
{
	user_exit();
}
NOKPROBE_SYMBOL(user_exit_callable);

void __init ct_cpu_track_user(int cpu)
{
	static __initdata bool initialized = false;

	if (!per_cpu(context_tracking.active, cpu)) {
		per_cpu(context_tracking.active, cpu) = true;
		static_branch_inc(&context_tracking_key);
	}

	if (initialized)
		return;

#ifdef CONFIG_HAVE_TIF_NOHZ
	/*
	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork
	 * This assumes that init is the only task at this early boot stage.
	 */
	set_tsk_thread_flag(&init_task, TIF_NOHZ);
#endif
	WARN_ON_ONCE(!tasklist_empty());

	initialized = true;
}

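/*
 * Informational note, not new code: ct_cpu_track_user() is invoked once per
 * CPU that needs user/kernel transition tracking, either for every possible
 * CPU by context_tracking_init() below when CONFIG_CONTEXT_TRACKING_USER_FORCE
 * is set, or typically per nohz_full= CPU from the tick setup code at boot.
 */
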
#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
void __init context_tracking_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		ct_cpu_track_user(cpu);
}
#endif

#endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */