Commit | Line | Data |
---|---|---|
142781e1 TG |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include <linux/context_tracking.h> | |
4 | #include <linux/entry-common.h> | |
a9f3a74a TG |
5 | #include <linux/livepatch.h> |
6 | #include <linux/audit.h> | |
142781e1 TG |
7 | |
8 | #define CREATE_TRACE_POINTS | |
9 | #include <trace/events/syscalls.h> | |
10 | ||
11 | /** | |
12 | * enter_from_user_mode - Establish state when coming from user mode | |
13 | * | |
14 | * Syscall/interrupt entry disables interrupts, but user mode is traced as | |
15 | * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. | |
16 | * | |
17 | * 1) Tell lockdep that interrupts are disabled | |
18 | * 2) Invoke context tracking if enabled to reactivate RCU | |
19 | * 3) Trace interrupts off state | |
20 | */ | |
21 | static __always_inline void enter_from_user_mode(struct pt_regs *regs) | |
22 | { | |
23 | arch_check_user_regs(regs); | |
24 | lockdep_hardirqs_off(CALLER_ADDR0); | |
25 | ||
26 | CT_WARN_ON(ct_state() != CONTEXT_USER); | |
27 | user_exit_irqoff(); | |
28 | ||
29 | instrumentation_begin(); | |
30 | trace_hardirqs_off_finish(); | |
31 | instrumentation_end(); | |
32 | } | |
33 | ||
34 | static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) | |
35 | { | |
36 | if (unlikely(audit_context())) { | |
37 | unsigned long args[6]; | |
38 | ||
39 | syscall_get_arguments(current, regs, args); | |
40 | audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); | |
41 | } | |
42 | } | |
43 | ||
44 | static long syscall_trace_enter(struct pt_regs *regs, long syscall, | |
b86678cf | 45 | unsigned long ti_work, unsigned long work) |
142781e1 TG |
46 | { |
47 | long ret = 0; | |
48 | ||
49 | /* Handle ptrace */ | |
64eb35f7 | 50 | if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { |
142781e1 | 51 | ret = arch_syscall_enter_tracehook(regs); |
64eb35f7 | 52 | if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) |
142781e1 TG |
53 | return -1L; |
54 | } | |
55 | ||
56 | /* Do seccomp after ptrace, to catch any tracer changes. */ | |
23d67a54 | 57 | if (work & SYSCALL_WORK_SECCOMP) { |
142781e1 TG |
58 | ret = __secure_computing(NULL); |
59 | if (ret == -1L) | |
60 | return ret; | |
61 | } | |
62 | ||
b6ec4134 KC |
63 | /* Either of the above might have changed the syscall number */ |
64 | syscall = syscall_get_nr(current, regs); | |
65 | ||
524666cb | 66 | if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) |
142781e1 TG |
67 | trace_sys_enter(regs, syscall); |
68 | ||
69 | syscall_enter_audit(regs, syscall); | |
70 | ||
71 | return ret ? : syscall; | |
72 | } | |
73 | ||
4facb95b TG |
74 | static __always_inline long |
75 | __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) | |
142781e1 | 76 | { |
b86678cf | 77 | unsigned long work = READ_ONCE(current_thread_info()->syscall_work); |
142781e1 TG |
78 | unsigned long ti_work; |
79 | ||
142781e1 | 80 | ti_work = READ_ONCE(current_thread_info()->flags); |
b86678cf GKB |
81 | if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) |
82 | syscall = syscall_trace_enter(regs, syscall, ti_work, work); | |
142781e1 TG |
83 | |
84 | return syscall; | |
85 | } | |
86 | ||
4facb95b TG |
/*
 * Out-of-line entry point for the syscall entry work: forwards to
 * __syscall_enter_from_user_work() and returns its result.
 */
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	long nr = __syscall_enter_from_user_work(regs, syscall);

	return nr;
}
91 | ||
92 | noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) | |
93 | { | |
94 | long ret; | |
95 | ||
96 | enter_from_user_mode(regs); | |
97 | ||
98 | instrumentation_begin(); | |
99 | local_irq_enable(); | |
100 | ret = __syscall_enter_from_user_work(regs, syscall); | |
101 | instrumentation_end(); | |
102 | ||
103 | return ret; | |
104 | } | |
105 | ||
106 | noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) | |
107 | { | |
108 | enter_from_user_mode(regs); | |
109 | instrumentation_begin(); | |
110 | local_irq_enable(); | |
111 | instrumentation_end(); | |
112 | } | |
113 | ||
a9f3a74a TG |
114 | /** |
115 | * exit_to_user_mode - Fixup state when exiting to user mode | |
116 | * | |
117 | * Syscall/interupt exit enables interrupts, but the kernel state is | |
118 | * interrupts disabled when this is invoked. Also tell RCU about it. | |
119 | * | |
120 | * 1) Trace interrupts on state | |
121 | * 2) Invoke context tracking if enabled to adjust RCU state | |
122 | * 3) Invoke architecture specific last minute exit code, e.g. speculation | |
123 | * mitigations, etc. | |
124 | * 4) Tell lockdep that interrupts are enabled | |
125 | */ | |
126 | static __always_inline void exit_to_user_mode(void) | |
127 | { | |
128 | instrumentation_begin(); | |
129 | trace_hardirqs_on_prepare(); | |
130 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
131 | instrumentation_end(); | |
132 | ||
133 | user_enter_irqoff(); | |
134 | arch_exit_to_user_mode(); | |
135 | lockdep_hardirqs_on(CALLER_ADDR0); | |
136 | } | |
137 | ||
138 | /* Workaround to allow gradual conversion of architecture code */ | |
12db8b69 JA |
139 | void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } |
140 | ||
141 | static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) | |
142 | { | |
143 | if (ti_work & _TIF_NOTIFY_SIGNAL) | |
144 | tracehook_notify_signal(); | |
145 | ||
146 | arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); | |
147 | } | |
a9f3a74a TG |
148 | |
149 | static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, | |
150 | unsigned long ti_work) | |
151 | { | |
152 | /* | |
153 | * Before returning to user space ensure that all pending work | |
154 | * items have been completed. | |
155 | */ | |
156 | while (ti_work & EXIT_TO_USER_MODE_WORK) { | |
157 | ||
158 | local_irq_enable_exit_to_user(ti_work); | |
159 | ||
160 | if (ti_work & _TIF_NEED_RESCHED) | |
161 | schedule(); | |
162 | ||
163 | if (ti_work & _TIF_UPROBE) | |
164 | uprobe_notify_resume(regs); | |
165 | ||
166 | if (ti_work & _TIF_PATCH_PENDING) | |
167 | klp_update_patch_state(current); | |
168 | ||
12db8b69 JA |
169 | if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) |
170 | handle_signal_work(regs, ti_work); | |
a9f3a74a TG |
171 | |
172 | if (ti_work & _TIF_NOTIFY_RESUME) { | |
a9f3a74a TG |
173 | tracehook_notify_resume(regs); |
174 | rseq_handle_notify_resume(NULL, regs); | |
175 | } | |
176 | ||
177 | /* Architecture specific TIF work */ | |
178 | arch_exit_to_user_mode_work(regs, ti_work); | |
179 | ||
180 | /* | |
181 | * Disable interrupts and reevaluate the work flags as they | |
182 | * might have changed while interrupts and preemption was | |
183 | * enabled above. | |
184 | */ | |
185 | local_irq_disable_exit_to_user(); | |
186 | ti_work = READ_ONCE(current_thread_info()->flags); | |
187 | } | |
188 | ||
189 | /* Return the latest work state for arch_exit_to_user_mode() */ | |
190 | return ti_work; | |
191 | } | |
192 | ||
193 | static void exit_to_user_mode_prepare(struct pt_regs *regs) | |
194 | { | |
195 | unsigned long ti_work = READ_ONCE(current_thread_info()->flags); | |
196 | ||
197 | lockdep_assert_irqs_disabled(); | |
198 | ||
199 | if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) | |
200 | ti_work = exit_to_user_mode_loop(regs, ti_work); | |
201 | ||
202 | arch_exit_to_user_mode_prepare(regs, ti_work); | |
203 | ||
204 | /* Ensure that the address limit is intact and no locks are held */ | |
205 | addr_limit_user_check(); | |
206 | lockdep_assert_irqs_disabled(); | |
207 | lockdep_sys_exit(); | |
208 | } | |
209 | ||
#ifndef _TIF_SINGLESTEP
/* Without _TIF_SINGLESTEP there is never a single step to report. */
static inline bool report_single_step(unsigned long work)
{
	return false;
}
#else
/*
 * report_single_step - Decide whether syscall exit has to report a step
 * @work:	Snapshot of the task's syscall_work flags
 *
 * A single step is reported when TIF_SINGLESTEP is set and syscall
 * emulation is not active. When SYSCALL_WORK_SYSCALL_EMU is set (e.g.
 * PTRACE_SYSEMU_SINGLESTEP) the syscall instruction has already been
 * reported in syscall_enter_from_user_mode(), so it must not be reported
 * a second time on exit.
 *
 * Returns true when the exit tracehook has to be told about a step.
 */
static inline bool report_single_step(unsigned long work)
{
	/* Emulated syscalls were reported on entry already */
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
}
#endif
229 | ||
b86678cf GKB |
230 | static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, |
231 | unsigned long work) | |
a9f3a74a TG |
232 | { |
233 | bool step; | |
234 | ||
235 | audit_syscall_exit(regs); | |
236 | ||
524666cb | 237 | if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) |
a9f3a74a TG |
238 | trace_sys_exit(regs, syscall_get_return_value(current, regs)); |
239 | ||
64eb35f7 | 240 | step = report_single_step(work); |
64c19ba2 | 241 | if (step || work & SYSCALL_WORK_SYSCALL_TRACE) |
a9f3a74a TG |
242 | arch_syscall_exit_tracehook(regs, step); |
243 | } | |
244 | ||
245 | /* | |
246 | * Syscall specific exit to user mode preparation. Runs with interrupts | |
247 | * enabled. | |
248 | */ | |
249 | static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) | |
250 | { | |
b86678cf | 251 | unsigned long work = READ_ONCE(current_thread_info()->syscall_work); |
a9f3a74a TG |
252 | u32 cached_flags = READ_ONCE(current_thread_info()->flags); |
253 | unsigned long nr = syscall_get_nr(current, regs); | |
254 | ||
255 | CT_WARN_ON(ct_state() != CONTEXT_KERNEL); | |
256 | ||
257 | if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { | |
258 | if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) | |
259 | local_irq_enable(); | |
260 | } | |
261 | ||
262 | rseq_syscall(regs); | |
263 | ||
264 | /* | |
265 | * Do one-time syscall specific work. If these work items are | |
266 | * enabled, we want to run them exactly once per syscall exit with | |
267 | * interrupts enabled. | |
268 | */ | |
b86678cf GKB |
269 | if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) |
270 | syscall_exit_work(regs, cached_flags, work); | |
a9f3a74a TG |
271 | } |
272 | ||
273 | __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) | |
274 | { | |
275 | instrumentation_begin(); | |
276 | syscall_exit_to_user_mode_prepare(regs); | |
277 | local_irq_disable_exit_to_user(); | |
278 | exit_to_user_mode_prepare(regs); | |
279 | instrumentation_end(); | |
280 | exit_to_user_mode(); | |
281 | } | |
282 | ||
142781e1 TG |
283 | noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) |
284 | { | |
285 | enter_from_user_mode(regs); | |
286 | } | |
a9f3a74a TG |
287 | |
288 | noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) | |
289 | { | |
290 | instrumentation_begin(); | |
291 | exit_to_user_mode_prepare(regs); | |
292 | instrumentation_end(); | |
293 | exit_to_user_mode(); | |
294 | } | |
a5497bab | 295 | |
aadfc2f9 | 296 | noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) |
a5497bab TG |
297 | { |
298 | irqentry_state_t ret = { | |
299 | .exit_rcu = false, | |
300 | }; | |
301 | ||
302 | if (user_mode(regs)) { | |
303 | irqentry_enter_from_user_mode(regs); | |
304 | return ret; | |
305 | } | |
306 | ||
307 | /* | |
308 | * If this entry hit the idle task invoke rcu_irq_enter() whether | |
309 | * RCU is watching or not. | |
310 | * | |
78a56e04 | 311 | * Interrupts can nest when the first interrupt invokes softirq |
a5497bab TG |
312 | * processing on return which enables interrupts. |
313 | * | |
314 | * Scheduler ticks in the idle task can mark quiescent state and | |
315 | * terminate a grace period, if and only if the timer interrupt is | |
316 | * not nested into another interrupt. | |
317 | * | |
7f2a53c2 | 318 | * Checking for rcu_is_watching() here would prevent the nesting |
a5497bab TG |
319 | * interrupt to invoke rcu_irq_enter(). If that nested interrupt is |
320 | * the tick then rcu_flavor_sched_clock_irq() would wrongfully | |
321 | * assume that it is the first interupt and eventually claim | |
78a56e04 | 322 | * quiescent state and end grace periods prematurely. |
a5497bab TG |
323 | * |
324 | * Unconditionally invoke rcu_irq_enter() so RCU state stays | |
325 | * consistent. | |
326 | * | |
327 | * TINY_RCU does not support EQS, so let the compiler eliminate | |
328 | * this part when enabled. | |
329 | */ | |
330 | if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { | |
331 | /* | |
332 | * If RCU is not watching then the same careful | |
333 | * sequence vs. lockdep and tracing is required | |
45ff5105 | 334 | * as in irqentry_enter_from_user_mode(). |
a5497bab TG |
335 | */ |
336 | lockdep_hardirqs_off(CALLER_ADDR0); | |
337 | rcu_irq_enter(); | |
338 | instrumentation_begin(); | |
339 | trace_hardirqs_off_finish(); | |
340 | instrumentation_end(); | |
341 | ||
342 | ret.exit_rcu = true; | |
343 | return ret; | |
344 | } | |
345 | ||
346 | /* | |
347 | * If RCU is watching then RCU only wants to check whether it needs | |
348 | * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() | |
349 | * already contains a warning when RCU is not watching, so no point | |
350 | * in having another one here. | |
351 | */ | |
9d820f68 | 352 | lockdep_hardirqs_off(CALLER_ADDR0); |
a5497bab TG |
353 | instrumentation_begin(); |
354 | rcu_irq_enter_check_tick(); | |
9d820f68 | 355 | trace_hardirqs_off_finish(); |
a5497bab TG |
356 | instrumentation_end(); |
357 | ||
358 | return ret; | |
359 | } | |
360 | ||
361 | void irqentry_exit_cond_resched(void) | |
362 | { | |
363 | if (!preempt_count()) { | |
364 | /* Sanity check RCU and thread stack */ | |
365 | rcu_irq_exit_check_preempt(); | |
366 | if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) | |
367 | WARN_ON_ONCE(!on_thread_stack()); | |
368 | if (need_resched()) | |
369 | preempt_schedule_irq(); | |
370 | } | |
371 | } | |
372 | ||
aadfc2f9 | 373 | noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) |
a5497bab TG |
374 | { |
375 | lockdep_assert_irqs_disabled(); | |
376 | ||
377 | /* Check whether this returns to user mode */ | |
378 | if (user_mode(regs)) { | |
379 | irqentry_exit_to_user_mode(regs); | |
380 | } else if (!regs_irqs_disabled(regs)) { | |
381 | /* | |
382 | * If RCU was not watching on entry this needs to be done | |
383 | * carefully and needs the same ordering of lockdep/tracing | |
384 | * and RCU as the return to user mode path. | |
385 | */ | |
386 | if (state.exit_rcu) { | |
387 | instrumentation_begin(); | |
388 | /* Tell the tracer that IRET will enable interrupts */ | |
389 | trace_hardirqs_on_prepare(); | |
390 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
391 | instrumentation_end(); | |
392 | rcu_irq_exit(); | |
393 | lockdep_hardirqs_on(CALLER_ADDR0); | |
394 | return; | |
395 | } | |
396 | ||
397 | instrumentation_begin(); | |
398 | if (IS_ENABLED(CONFIG_PREEMPTION)) | |
399 | irqentry_exit_cond_resched(); | |
400 | /* Covers both tracing and lockdep */ | |
401 | trace_hardirqs_on(); | |
402 | instrumentation_end(); | |
403 | } else { | |
404 | /* | |
405 | * IRQ flags state is correct already. Just tell RCU if it | |
406 | * was not watching on entry. | |
407 | */ | |
408 | if (state.exit_rcu) | |
409 | rcu_irq_exit(); | |
410 | } | |
411 | } | |
b6be002b TG |
412 | |
413 | irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) | |
414 | { | |
415 | irqentry_state_t irq_state; | |
416 | ||
417 | irq_state.lockdep = lockdep_hardirqs_enabled(); | |
418 | ||
419 | __nmi_enter(); | |
420 | lockdep_hardirqs_off(CALLER_ADDR0); | |
421 | lockdep_hardirq_enter(); | |
422 | rcu_nmi_enter(); | |
423 | ||
424 | instrumentation_begin(); | |
425 | trace_hardirqs_off_finish(); | |
426 | ftrace_nmi_enter(); | |
427 | instrumentation_end(); | |
428 | ||
429 | return irq_state; | |
430 | } | |
431 | ||
432 | void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) | |
433 | { | |
434 | instrumentation_begin(); | |
435 | ftrace_nmi_exit(); | |
436 | if (irq_state.lockdep) { | |
437 | trace_hardirqs_on_prepare(); | |
438 | lockdep_hardirqs_on_prepare(CALLER_ADDR0); | |
439 | } | |
440 | instrumentation_end(); | |
441 | ||
442 | rcu_nmi_exit(); | |
443 | lockdep_hardirq_exit(); | |
444 | if (irq_state.lockdep) | |
445 | lockdep_hardirqs_on(CALLER_ADDR0); | |
446 | __nmi_exit(); | |
447 | } |